diff options
author | Glenn Adams <gadams@apache.org> | 2014-10-12 04:18:37 +0000 |
---|---|---|
committer | Glenn Adams <gadams@apache.org> | 2014-10-12 04:18:37 +0000 |
commit | cb807d07f176fd1f852c0a8fe6e99af43fa090bb (patch) | |
tree | 85c96317c7b2494d0c95f54859b0a063657517e0 | |
parent | f03a1498ffa126a24ed9234b8cbc17ea648f958d (diff) | |
download | xmlgraphics-fop-cb807d07f176fd1f852c0a8fe6e99af43fa090bb.tar.gz xmlgraphics-fop-cb807d07f176fd1f852c0a8fe6e99af43fa090bb.zip |
FOP-2287: support Tamil script (preliminary)
git-svn-id: https://svn.apache.org/repos/asf/xmlgraphics/fop/trunk@1631147 13f79535-47bb-0310-9956-ffa450edef68
9 files changed, 728 insertions, 25 deletions
diff --git a/src/java/org/apache/fop/complexscripts/fonts/GlyphDefinitionTable.java b/src/java/org/apache/fop/complexscripts/fonts/GlyphDefinitionTable.java index a5942536c..2ed1c2875 100644 --- a/src/java/org/apache/fop/complexscripts/fonts/GlyphDefinitionTable.java +++ b/src/java/org/apache/fop/complexscripts/fonts/GlyphDefinitionTable.java @@ -96,14 +96,15 @@ public class GlyphDefinitionTable extends GlyphTable { * method since when the segment is reversed by BIDI processing, marks are automatically reordered to precede * their base glyph. * @param gs an input glyph sequence + * @param widths associated advance widths (also reordered) * @param gpa associated glyph position adjustments (also reordered) * @param script a script identifier * @param language a language identifier * @return the reordered (output) glyph sequence */ - public GlyphSequence reorderCombiningMarks(GlyphSequence gs, int[][] gpa, String script, String language) { + public GlyphSequence reorderCombiningMarks(GlyphSequence gs, int[] widths, int[][] gpa, String script, String language) { ScriptProcessor sp = ScriptProcessor.getInstance(script); - return sp.reorderCombiningMarks(this, gs, gpa, script, language); + return sp.reorderCombiningMarks(this, gs, widths, gpa, script, language); } /** {@inheritDoc} */ diff --git a/src/java/org/apache/fop/complexscripts/scripts/ArabicScriptProcessor.java b/src/java/org/apache/fop/complexscripts/scripts/ArabicScriptProcessor.java index 2753ca945..b108c5ebe 100644 --- a/src/java/org/apache/fop/complexscripts/scripts/ArabicScriptProcessor.java +++ b/src/java/org/apache/fop/complexscripts/scripts/ArabicScriptProcessor.java @@ -141,7 +141,7 @@ public class ArabicScriptProcessor extends DefaultScriptProcessor { /** {@inheritDoc} */ @Override - public GlyphSequence reorderCombiningMarks(GlyphDefinitionTable gdef, GlyphSequence gs, int[][] gpa, String script, String language) { + public GlyphSequence reorderCombiningMarks(GlyphDefinitionTable gdef, GlyphSequence gs, int[] widths, int[][] gpa, String script, String language) { // a side effect of BIDI reordering is to order combining marks before their base, so we need to override the default here to // prevent double reordering return gs; diff --git a/src/java/org/apache/fop/complexscripts/scripts/DefaultScriptProcessor.java b/src/java/org/apache/fop/complexscripts/scripts/DefaultScriptProcessor.java index ced4d4041..275fb20dc 100644 --- a/src/java/org/apache/fop/complexscripts/scripts/DefaultScriptProcessor.java +++ b/src/java/org/apache/fop/complexscripts/scripts/DefaultScriptProcessor.java @@ -80,14 +80,15 @@ public class DefaultScriptProcessor extends ScriptProcessor { @Override /** {@inheritDoc} */ - public GlyphSequence reorderCombiningMarks(GlyphDefinitionTable gdef, GlyphSequence gs, int[][] gpa, String script, String language) { + public GlyphSequence reorderCombiningMarks(GlyphDefinitionTable gdef, GlyphSequence gs, int[] unscaledWidths, int[][] gpa, String script, String language) { int ng = gs.getGlyphCount(); int[] ga = gs.getGlyphArray(false); int nm = 0; // count combining marks for (int i = 0; i < ng; i++) { int gid = ga [ i ]; - if (gdef.isGlyphClass(gid, GlyphDefinitionTable.GLYPH_CLASS_MARK)) { + int gw = unscaledWidths [ i ]; + if (isReorderedMark(gdef, ga, unscaledWidths, i)) { nm++; } } @@ -105,7 +106,7 @@ public class DefaultScriptProcessor extends ScriptProcessor { int gid = ga [ i ]; int[] pa = (gpa != null) ? gpa [ i ] : null; CharAssociation ca = aa [ i ]; - if (gdef.isGlyphClass(gid, GlyphDefinitionTable.GLYPH_CLASS_MARK)) { + if (isReorderedMark(gdef, ga, unscaledWidths, i)) { nga [ k ] = gid; naa [ k ] = ca; if (npa != null) { @@ -149,4 +150,8 @@ public class DefaultScriptProcessor extends ScriptProcessor { } } + protected boolean isReorderedMark(GlyphDefinitionTable gdef, int[] glyphs, int[] unscaledWidths, int index) { + return gdef.isGlyphClass(glyphs[index], GlyphDefinitionTable.GLYPH_CLASS_MARK) && (unscaledWidths[index] != 0); + } + } diff --git a/src/java/org/apache/fop/complexscripts/scripts/GurmukhiScriptProcessor.java b/src/java/org/apache/fop/complexscripts/scripts/GurmukhiScriptProcessor.java index 94f5893a9..cc50761bb 100644 --- a/src/java/org/apache/fop/complexscripts/scripts/GurmukhiScriptProcessor.java +++ b/src/java/org/apache/fop/complexscripts/scripts/GurmukhiScriptProcessor.java @@ -22,7 +22,6 @@ package org.apache.fop.complexscripts.scripts; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.fop.complexscripts.fonts.GlyphDefinitionTable; import org.apache.fop.complexscripts.util.CharAssociation; import org.apache.fop.complexscripts.util.GlyphSequence; @@ -538,9 +537,4 @@ public class GurmukhiScriptProcessor extends IndicScriptProcessor { return hasFlag(c, C_N); } - @Override - public GlyphSequence reorderCombiningMarks(GlyphDefinitionTable gdef, GlyphSequence gs, int[][] gpa, String script, String language) { - return super.reorderCombiningMarks(gdef, gs, gpa, script, language); - } - } diff --git a/src/java/org/apache/fop/complexscripts/scripts/IndicScriptProcessor.java b/src/java/org/apache/fop/complexscripts/scripts/IndicScriptProcessor.java index ad533b843..01d5c6d70 100644 --- a/src/java/org/apache/fop/complexscripts/scripts/IndicScriptProcessor.java +++ b/src/java/org/apache/fop/complexscripts/scripts/IndicScriptProcessor.java @@ -126,6 +126,9 @@ public class IndicScriptProcessor extends DefaultScriptProcessor { case CharScript.SCRIPT_GURMUKHI: case CharScript.SCRIPT_GURMUKHI_2: return new GurmukhiScriptProcessor(script); + case CharScript.SCRIPT_TAMIL: + case CharScript.SCRIPT_TAMIL_2: + return new TamilScriptProcessor(script); // [TBD] implement other script processors default: return new IndicScriptProcessor(script); @@ -460,22 +463,30 @@ public class IndicScriptProcessor extends DefaultScriptProcessor { Vector<Segment> sv = new Vector<Segment>(nc); for (int s = 0, e = nc; s < e; ) { int i; - if ((i = findStartOfSyllable(ca, s, e)) > s) { - // from s to i is non-syllable segment - sv.add(new Segment(s, i, Segment.OTHER)); + if ((i = findStartOfSyllable(ca, s, e)) < e) { + if (s < i) { + // from s to i is non-syllable segment + sv.add(new Segment(s, i, Segment.OTHER)); + } s = i; // move s to start of syllable - } else if (i > s) { - // from s to e is non-syllable segment - sv.add(new Segment(s, e, Segment.OTHER)); + } else { + if (s < e) { + // from s to e is non-syllable segment + sv.add(new Segment(s, e, Segment.OTHER)); + } s = e; // move s to end of input sequence } if ((i = findEndOfSyllable(ca, s, e)) > s) { - // from s to i is syllable segment - sv.add(new Segment(s, i, Segment.SYLLABLE)); + if (s < i) { + // from s to i is syllable segment + sv.add(new Segment(s, i, Segment.SYLLABLE)); + } s = i; // move s to end of syllable } else { - // from s to e is non-syllable segment - sv.add(new Segment(s, e, Segment.OTHER)); + if (s < e) { + // from s to e is non-syllable segment + sv.add(new Segment(s, e, Segment.OTHER)); + } s = e; // move s to end of input sequence } } diff --git a/src/java/org/apache/fop/complexscripts/scripts/ScriptProcessor.java b/src/java/org/apache/fop/complexscripts/scripts/ScriptProcessor.java index cfcc4ff59..d492045f1 100644 --- a/src/java/org/apache/fop/complexscripts/scripts/ScriptProcessor.java +++ b/src/java/org/apache/fop/complexscripts/scripts/ScriptProcessor.java @@ -123,12 +123,13 @@ public abstract class ScriptProcessor { * their base glyph. * @param gdef the glyph definition table that applies * @param gs an input glyph sequence + * @param unscaledWidths associated unscaled advance widths (also reordered) * @param gpa associated glyph position adjustments (also reordered) * @param script a script identifier * @param language a language identifier * @return the reordered (output) glyph sequence */ - public GlyphSequence reorderCombiningMarks(GlyphDefinitionTable gdef, GlyphSequence gs, int[][] gpa, String script, String language) { + public GlyphSequence reorderCombiningMarks(GlyphDefinitionTable gdef, GlyphSequence gs, int[] unscaledWidths, int[][] gpa, String script, String language) { return gs; } diff --git a/src/java/org/apache/fop/complexscripts/scripts/TamilScriptProcessor.java b/src/java/org/apache/fop/complexscripts/scripts/TamilScriptProcessor.java new file mode 100644 index 000000000..6df0bf8ee --- /dev/null +++ b/src/java/org/apache/fop/complexscripts/scripts/TamilScriptProcessor.java @@ -0,0 +1,542 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* $Id$ */ + +package org.apache.fop.complexscripts.scripts; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import org.apache.fop.complexscripts.util.CharAssociation; +import org.apache.fop.complexscripts.util.GlyphSequence; + +// CSOFF: LineLengthCheck + +/** + * <p>The <code>TamilScriptProcessor</code> class implements a script processor for + * performing glyph substitution and positioning operations on content associated with the Tamil script.</p> + * + * <p>This work was originally authored by Glenn Adams (gadams@apache.org).</p> + */ +public class TamilScriptProcessor extends IndicScriptProcessor { + + /** logging instance */ + private static final Log log = LogFactory.getLog(TamilScriptProcessor.class); + + TamilScriptProcessor(String script) { + super(script); + } + + @Override + protected Class<? extends TamilSyllabizer> getSyllabizerClass() { + return TamilSyllabizer.class; + } + + @Override + // find rightmost pre-base matra + protected int findPreBaseMatra(GlyphSequence gs) { + int ng = gs.getGlyphCount(); + int lk = -1; + for (int i = ng; i > 0; i--) { + int k = i - 1; + if (containsPreBaseMatra(gs, k)) { + lk = k; + break; + } + } + return lk; + } + + @Override + // find leftmost pre-base matra target, starting from source + protected int findPreBaseMatraTarget(GlyphSequence gs, int source) { + int ng = gs.getGlyphCount(); + int lk = -1; + for (int i = (source < ng) ? source : ng; i > 0; i--) { + int k = i - 1; + if (containsConsonant(gs, k)) { + if (containsHalfConsonant(gs, k)) { + lk = k; + } else if (lk == -1) { + lk = k; + } else { + break; + } + } + } + return lk; + } + + private static boolean containsPreBaseMatra(GlyphSequence gs, int k) { + CharAssociation a = gs.getAssociation(k); + int[] ca = gs.getCharacterArray(false); + for (int i = a.getStart(), e = a.getEnd(); i < e; i++) { + if (isPreM(ca [ i ])) { + return true; + } + } + return false; + } + + private static boolean containsConsonant(GlyphSequence gs, int k) { + CharAssociation a = gs.getAssociation(k); + int[] ca = gs.getCharacterArray(false); + for (int i = a.getStart(), e = a.getEnd(); i < e; i++) { + if (isC(ca [ i ])) { + return true; + } + } + return false; + } + + private static boolean containsHalfConsonant(GlyphSequence gs, int k) { + Boolean half = (Boolean) gs.getAssociation(k).getPredication("half"); + return (half != null) ? half.booleanValue() : false; + } + + @Override + protected int findReph(GlyphSequence gs) { + int ng = gs.getGlyphCount(); + int li = -1; + for (int i = 0; i < ng; i++) { + if (containsReph(gs, i)) { + li = i; + break; + } + } + return li; + } + + @Override + protected int findRephTarget(GlyphSequence gs, int source) { + int ng = gs.getGlyphCount(); + int c1 = -1; + int c2 = -1; + // first candidate target is after first non-half consonant + for (int i = 0; i < ng; i++) { + if ((i != source) && containsConsonant(gs, i)) { + if (!containsHalfConsonant(gs, i)) { + c1 = i + 1; + break; + } + } + } + // second candidate target is after last non-prebase matra after first candidate or before first syllable or vedic mark + for (int i = (c1 >= 0) ? c1 : 0; i < ng; i++) { + if (containsMatra(gs, i) && !containsPreBaseMatra(gs, i)) { + c2 = i + 1; + } else if (containsOtherMark(gs, i)) { + c2 = i; + break; + } + } + if (c2 >= 0) { + return c2; + } else if (c1 >= 0) { + return c1; + } else { + return source; + } + } + + private static boolean containsReph(GlyphSequence gs, int k) { + Boolean rphf = (Boolean) gs.getAssociation(k).getPredication("rphf"); + return (rphf != null) ? rphf.booleanValue() : false; + } + + private static boolean containsMatra(GlyphSequence gs, int k) { + CharAssociation a = gs.getAssociation(k); + int[] ca = gs.getCharacterArray(false); + for (int i = a.getStart(), e = a.getEnd(); i < e; i++) { + if (isM(ca [ i ])) { + return true; + } + } + return false; + } + + private static boolean containsOtherMark(GlyphSequence gs, int k) { + CharAssociation a = gs.getAssociation(k); + int[] ca = gs.getCharacterArray(false); + for (int i = a.getStart(), e = a.getEnd(); i < e; i++) { + switch (typeOf(ca [ i ])) { + case C_T: // tone (e.g., udatta, anudatta) + case C_A: // accent (e.g., acute, grave) + case C_O: // other (e.g., candrabindu, anusvara, visarga, etc) + return true; + default: + break; + } + } + return false; + } + + private static class TamilSyllabizer extends DefaultSyllabizer { + TamilSyllabizer(String script, String language) { + super(script, language); + } + @Override + // | C ... + protected int findStartOfSyllable(int[] ca, int s, int e) { + if ((s < 0) || (s >= e)) { + return -1; + } else { + while (s < e) { + int c = ca [ s ]; + if (isC(c)) { + break; + } else { + s++; + } + } + return s; + } + } + @Override + // D* L? | ... + protected int findEndOfSyllable(int[] ca, int s, int e) { + if ((s < 0) || (s >= e)) { + return -1; + } else { + int nd = 0; + int nl = 0; + int i; + // consume dead consonants + while ((i = isDeadConsonant(ca, s, e)) > s) { + s = i; + nd++; + } + // consume zero or one live consonant + if ((i = isLiveConsonant(ca, s, e)) > s) { + s = i; + nl++; + } + return ((nd > 0) || (nl > 0)) ? s : -1; + } + } + // D := ( C N? H )? + private int isDeadConsonant(int[] ca, int s, int e) { + if (s < 0) { + return -1; + } else { + int c; + int i = 0; + int nc = 0; + int nh = 0; + do { + // C + if ((s + i) < e) { + c = ca [ s + i ]; + if (isC(c)) { + i++; + nc++; + } else { + break; + } + } + // N? + if ((s + i) < e) { + c = ca [ s + 1 ]; + if (isN(c)) { + i++; + } + } + // H + if ((s + i) < e) { + c = ca [ s + i ]; + if (isH(c)) { + i++; + nh++; + } else { + break; + } + } + } while (false); + return (nc > 0) && (nh > 0) ? s + i : -1; + } + } + // L := ( (C|V) N? X* )?; where X = ( MATRA | ACCENT MARK | TONE MARK | OTHER MARK ) + private int isLiveConsonant(int[] ca, int s, int e) { + if (s < 0) { + return -1; + } else { + int c; + int i = 0; + int nc = 0; + int nv = 0; + int nx = 0; + do { + // C + if ((s + i) < e) { + c = ca [ s + i ]; + if (isC(c)) { + i++; + nc++; + } else if (isV(c)) { + i++; + nv++; + } else { + break; + } + } + // N? + if ((s + i) < e) { + c = ca [ s + i ]; + if (isN(c)) { + i++; + } + } + // X* + while ((s + i) < e) { + c = ca [ s + i ]; + if (isX(c)) { + i++; + nx++; + } else { + break; + } + } + } while (false); + // if no X but has H, then ignore C|I + if (nx == 0) { + if ((s + i) < e) { + c = ca [ s + i ]; + if (isH(c)) { + if (nc > 0) { + nc--; + } else if (nv > 0) { + nv--; + } + } + } + } + return ((nc > 0) || (nv > 0)) ? s + i : -1; + } + } + } + + // tamil character types + static final short C_U = 0; // unassigned + static final short C_C = 1; // consonant + static final short C_V = 2; // vowel + static final short C_M = 3; // vowel sign (matra) + static final short C_S = 4; // symbol or sign + static final short C_T = 5; // tone mark + static final short C_A = 6; // accent mark + static final short C_P = 7; // punctuation + static final short C_D = 8; // digit + static final short C_H = 9; // halant (virama) + static final short C_O = 10; // other signs + static final short C_N = 0x0100; // nukta(ized) + static final short C_R = 0x0200; // reph(ized) + static final short C_PRE = 0x0400; // pre-base + static final short C_POST = 0x1000; // post-base + static final short C_WRAP = C_PRE | C_POST; // wrap (two part) vowel + static final short C_M_TYPE = 0x00FF; // type mask + static final short C_M_FLAGS = 0x7F00; // flag mask + // tamil block range + static final int CCA_START = 0x0B80; // first code point mapped by cca + static final int CCA_END = 0x0C00; // last code point + 1 mapped by cca + // tamil character type lookups + static final short[] CCA = { + C_U, // 0x0B80 // + C_U, // 0x0B81 // + C_O, // 0x0B82 // ANUSVARA + C_O, // 0x0B83 // VISARGA + C_U, // 0x0B84 // + C_V, // 0x0B85 // A + C_V, // 0x0B86 // AA + C_V, // 0x0B87 // I + C_V, // 0x0B88 // II + C_V, // 0x0B89 // U + C_V, // 0x0B8A // UU + C_U, // 0x0B8B // + C_U, // 0x0B8C // + C_U, // 0x0B8D // + C_V, // 0x0B8E // E + C_V, // 0x0B8F // EE + C_V, // 0x0B90 // AI + C_U, // 0x0B91 // + C_V, // 0x0B92 // O + C_V, // 0x0B93 // OO + C_V, // 0x0B94 // AU + C_C, // 0x0B95 // KA + C_U, // 0x0B96 // + C_U, // 0x0B97 // + C_U, // 0x0B98 // + C_C, // 0x0B99 // NGA + C_C, // 0x0B9A // CA + C_U, // 0x0B9B // + C_C, // 0x0B9C // JA + C_U, // 0x0B9D // + C_C, // 0x0B9E // NYA + C_C, // 0x0B9F // TTA + C_U, // 0x0BA0 // + C_U, // 0x0BA1 // + C_U, // 0x0BA2 // + C_C, // 0x0BA3 // NNA + C_C, // 0x0BA4 // TA + C_U, // 0x0BA5 // + C_U, // 0x0BA6 // + C_U, // 0x0BA7 // + C_C, // 0x0BA8 // NA + C_C, // 0x0BA9 // NNNA + C_C, // 0x0BAA // PA + C_U, // 0x0BAB // + C_U, // 0x0BAC // + C_U, // 0x0BAD // + C_C, // 0x0BAE // MA + C_C, // 0x0BAF // YA + C_C | C_R, // 0x0BB0 // RA + C_C | C_R, // 0x0BB1 // RRA + C_C, // 0x0BB2 // LA + C_C, // 0x0BB3 // LLA + C_C, // 0x0BB4 // LLLA + C_C, // 0x0BB5 // VA + C_C, // 0x0BB6 // SHA + C_C, // 0x0BB7 // SSA + C_C, // 0x0BB8 // SA + C_C, // 0x0BB9 // HA + C_U, // 0x0BBA // + C_U, // 0x0BBB // + C_U, // 0x0BBC // + C_U, // 0x0BBD // + C_M, // 0x0BBE // AA + C_M, // 0x0BBF // I + C_M, // 0x0BC0 // II + C_M, // 0x0BC1 // U + C_M, // 0x0BC2 // UU + C_U, // 0x0BC3 // + C_U, // 0x0BC4 // + C_U, // 0x0BC5 // + C_M | C_PRE, // 0x0BC6 // E + C_M | C_PRE, // 0x0BC7 // EE + C_M | C_PRE, // 0x0BC8 // AI + C_U, // 0x0BC9 // + C_M | C_WRAP, // 0x0BCA // O + C_M | C_WRAP, // 0x0BCB // OO + C_M | C_WRAP, // 0x0BCC // AU + C_H, // 0x0BCD // VIRAMA (HALANT) + C_U, // 0x0BCE // + C_U, // 0x0BCF // + C_S, // 0x0BD0 // OM + C_U, // 0x0BD1 // + C_U, // 0x0BD2 // + C_U, // 0x0BD3 // + C_U, // 0x0BD4 // + C_U, // 0x0BD5 // + C_U, // 0x0BD6 // + C_M, // 0x0BD7 // AU LENGTH MARK + C_U, // 0x0BD8 // + C_U, // 0x0BD9 // + C_U, // 0x0BDA // + C_U, // 0x0BDB // + C_U, // 0x0BDC // + C_U, // 0x0BDD // + C_U, // 0x0BDE // + C_U, // 0x0BDF // + C_U, // 0x0BE0 // + C_U, // 0x0BE1 // + C_U, // 0x0BE2 // + C_U, // 0x0BE3 // + C_U, // 0x0BE4 // + C_U, // 0x0BE5 // + C_D, // 0x0BE6 // ZERO + C_D, // 0x0BE7 // ONE + C_D, // 0x0BE8 // TWO + C_D, // 0x0BE9 // THREE + C_D, // 0x0BEA // FOUR + C_D, // 0x0BEB // FIVE + C_D, // 0x0BEC // SIX + C_D, // 0x0BED // SEVEN + C_D, // 0x0BEE // EIGHT + C_D, // 0x0BEF // NINE + C_S, // 0x0BF0 // TEN + C_S, // 0x0BF1 // ONE HUNDRED + C_S, // 0x0BF2 // ONE THOUSAND + C_S, // 0x0BF3 // DAY SIGN (naal) + C_S, // 0x0BF4 // MONTH SIGN (maatham) + C_S, // 0x0BF5 // YEAR SIGN (varudam) + C_S, // 0x0BF6 // DEBIT SIGN (patru) + C_S, // 0x0BF7 // CREDIT SIGN (varavu) + C_S, // 0x0BF8 // AS ABOVE SIGN (merpadi) + C_S, // 0x0BF9 // RUPEE SIGN (rupai) + C_S, // 0x0BFA // NUMBER SIGN (enn) + C_U, // 0x0BFB // + C_U, // 0x0BFC // + C_U, // 0x0BFD // + C_U, // 0x0BFE // + C_U // 0x0BFF // + }; + static int typeOf(int c) { + if ((c >= CCA_START) && (c < CCA_END)) { + return CCA [ c - CCA_START ] & C_M_TYPE; + } else { + return C_U; + } + } + static boolean isType(int c, int t) { + return typeOf(c) == t; + } + static boolean hasFlag(int c, int f) { + if ((c >= CCA_START) && (c < CCA_END)) { + return (CCA [ c - CCA_START ] & f) == f; + } else { + return false; + } + } + static boolean isC(int c) { + return isType(c, C_C); + } + static boolean isR(int c) { + return isType(c, C_C) && hasR(c); + } + static boolean isV(int c) { + return isType(c, C_V); + } + static boolean isN(int c) { + return c == 0x093C; + } + static boolean isH(int c) { + return c == 0x094D; + } + static boolean isM(int c) { + return isType(c, C_M); + } + static boolean isPreM(int c) { + return isType(c, C_M) && hasFlag(c, C_PRE); + } + static boolean isX(int c) { + switch (typeOf(c)) { + case C_M: // matra (combining vowel) + case C_A: // accent mark + case C_T: // tone mark + case C_O: // other (modifying) mark + return true; + default: + return false; + } + } + static boolean hasR(int c) { + return hasFlag(c, C_R); + } + static boolean hasN(int c) { + return hasFlag(c, C_N); + } + +} diff --git a/src/java/org/apache/fop/complexscripts/util/CharNormalize.java b/src/java/org/apache/fop/complexscripts/util/CharNormalize.java new file mode 100644 index 000000000..a0c8e960b --- /dev/null +++ b/src/java/org/apache/fop/complexscripts/util/CharNormalize.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* $Id$ */ + +package org.apache.fop.complexscripts.util; + +import java.util.Arrays; + +/** + * <p>Normalization related utilities. N.B. This implementation is an experimental + * shortcut, the full version of which would require either using ICU4J or an extraction + * of its normalization function, either being a significant undertaking. At present + * we handle only specialized decomposition of Indic two part matras.</p> + * + * <p>This work was originally authored by Glenn Adams (gadams@apache.org).</p> + */ +public final class CharNormalize { + + // CSOFF: LineLength + + private CharNormalize() { + } + + private static final int[] DECOMPOSABLES = { + // bengali + 0x09CB, + 0x09CC, + // oriya + 0x0B4B, + 0x0B4C, + // tamil + 0x0BCA, + 0x0BCB, + 0x0BCC, + // malayalam + 0x0D4A, + 0x0D4B, + 0x0D4C, + // sinhala + 0x0DDA, + 0x0DDC, + 0x0DDD, + 0x0DDE, + }; + + private static final int[][] DECOMPOSITIONS = { + // bengali + { 0x09C7, 0x09BE }, // 0x09CB + { 0x09C7, 0x09D7 }, // 0x09CC + // oriya + { 0x0B47, 0x0B4E }, // 0x0B4B + { 0x0B47, 0x0B57 }, // 0x0B4C + // tamil + { 0x0BC6, 0x0BBE }, // 0x0BCA + { 0x0BC7, 0x0BBE }, // 0x0BCB + { 0x0BC6, 0x0BD7 }, // 0x0BCC + // malayalam + { 0x0D46, 0x0D3E }, // 0x0D4A + { 0x0D47, 0x0D3E }, // 0x0D4B + { 0x0D46, 0x0D57 }, // 0x0D4C + // sinhala + { 0x0DD9, 0x0DCA }, // 0x0DDA + { 0x0DD9, 0x0DCF }, // 0x0DDC + { 0x0DD9, 0x0DCF, 0x0DCA }, // 0x0DDD + { 0x0DD9, 0x0DDF }, // 0x0DDE + }; + + private static final int MAX_DECOMPOSITION_LENGTH = 3; + + public static boolean isDecomposable(int c) { + return Arrays.binarySearch(DECOMPOSABLES, c) >= 0; + } + + public static int maximumDecompositionLength() { + return MAX_DECOMPOSITION_LENGTH; + } + + public static int[] decompose(int c, int[] da) { + int di = Arrays.binarySearch(DECOMPOSABLES, c); + if (di >= 0) { + return DECOMPOSITIONS[di]; + } else if ((da != null) && (da.length > 1)) { + da[0] = c; + da[1] = 0; + return da; + } else { + return new int[] { c }; + } + } + +} diff --git a/src/java/org/apache/fop/fonts/MultiByteFont.java b/src/java/org/apache/fop/fonts/MultiByteFont.java index 5b2f62a26..d95958c19 100644 --- a/src/java/org/apache/fop/fonts/MultiByteFont.java +++ b/src/java/org/apache/fop/fonts/MultiByteFont.java @@ -38,6 +38,7 @@ import org.apache.fop.complexscripts.fonts.GlyphSubstitutionTable; import org.apache.fop.complexscripts.fonts.GlyphTable; import org.apache.fop.complexscripts.fonts.Positionable; import org.apache.fop.complexscripts.fonts.Substitutable; +import org.apache.fop.complexscripts.util.CharNormalize; import org.apache.fop.complexscripts.util.GlyphSequence; import org.apache.fop.util.CharUtilities; @@ -491,7 +492,8 @@ public class MultiByteFont extends CIDFont implements Substitutable, Positionabl /** {@inheritDoc} */ public CharSequence performSubstitution(CharSequence cs, String script, String language, List associations) { if (gsub != null) { - GlyphSequence igs = mapCharsToGlyphs(cs, associations); + CharSequence ncs = normalize(cs, associations); + GlyphSequence igs = mapCharsToGlyphs(ncs, associations); GlyphSequence ogs = gsub.substitute(igs, script, language); if (associations != null) { associations.clear(); @@ -509,7 +511,7 @@ public class MultiByteFont extends CIDFont implements Substitutable, Positionabl CharSequence cs, int[][] gpa, String script, String language, List associations) { if (gdef != null) { GlyphSequence igs = mapCharsToGlyphs(cs, associations); - GlyphSequence ogs = gdef.reorderCombiningMarks(igs, gpa, script, language); + GlyphSequence ogs = gdef.reorderCombiningMarks(igs, getUnscaledWidths(igs), gpa, script, language); if (associations != null) { associations.clear(); associations.addAll(ogs.getAssociations()); @@ -521,6 +523,16 @@ public class MultiByteFont extends CIDFont implements Substitutable, Positionabl } } + protected int[] getUnscaledWidths(GlyphSequence gs) { + int[] widths = new int[gs.getGlyphCount()]; + for (int i = 0, n = widths.length; i < n; ++i) { + if (i < width.length) { + widths[i] = width[i]; + } + } + return widths; + } + /** {@inheritDoc} */ public boolean performsPositioning() { return gpos != null; @@ -652,6 +664,37 @@ public class MultiByteFont extends CIDFont implements Substitutable, Positionabl return cb; } + private CharSequence normalize(CharSequence cs, List associations) { + return hasDecomposable(cs) ? decompose(cs, associations) : cs; + } + + private boolean hasDecomposable(CharSequence cs) { + for (int i = 0, n = cs.length(); i < n; i++) { + int cc = cs.charAt(i); + if (CharNormalize.isDecomposable(cc)) { + return true; + } + } + return false; + } + + private CharSequence decompose(CharSequence cs, List associations) { + StringBuffer sb = new StringBuffer(cs.length()); + int[] daBuffer = new int[CharNormalize.maximumDecompositionLength()]; + for (int i = 0, n = cs.length(); i < n; i++) { + int cc = cs.charAt(i); + int[] da = CharNormalize.decompose(cc, daBuffer); + for (int j = 0; j < da.length; j++) { + if (da[j] > 0) { + sb.append((char) da[j]); + } else { + break; + } + } + } + return sb; + } + @Override public boolean hasFeature(int tableType, String script, String language, String feature) { GlyphTable table; |