From c54cb7efe6d808e67882930e828e3bd89ee976b9 Mon Sep 17 00:00:00 2001 From: Glenn Adams Date: Mon, 13 Oct 2014 20:57:22 +0000 Subject: [PATCH] FOP-2416: add support for Arabic Joiners (ZWJ/ZWNJ) - preliminary git-svn-id: https://svn.apache.org/repos/asf/xmlgraphics/fop/trunk@1631546 13f79535-47bb-0310-9956-ffa450edef68 --- .../scripts/ArabicScriptProcessor.java | 167 ++++++++++++------ .../scripts/ScriptsTestSuite.java | 8 +- .../scripts/arabic/ArabicJoinersTestCase.java | 138 +++++++++++++++ ...nts.java => ArabicWordFormsConstants.java} | 2 +- ...Case.java => ArabicWordFormsTestCase.java} | 2 +- .../arabic/GenerateArabicTestData.java | 2 +- 6 files changed, 259 insertions(+), 60 deletions(-) create mode 100644 test/java/org/apache/fop/complexscripts/scripts/arabic/ArabicJoinersTestCase.java rename test/java/org/apache/fop/complexscripts/scripts/arabic/{ArabicTestConstants.java => ArabicWordFormsConstants.java} (97%) rename test/java/org/apache/fop/complexscripts/scripts/arabic/{ArabicTestCase.java => ArabicWordFormsTestCase.java} (98%) diff --git a/src/java/org/apache/fop/complexscripts/scripts/ArabicScriptProcessor.java b/src/java/org/apache/fop/complexscripts/scripts/ArabicScriptProcessor.java index b108c5ebe..6ffd8c5ef 100644 --- a/src/java/org/apache/fop/complexscripts/scripts/ArabicScriptProcessor.java +++ b/src/java/org/apache/fop/complexscripts/scripts/ArabicScriptProcessor.java @@ -33,6 +33,7 @@ import org.apache.fop.complexscripts.util.CharAssociation; import org.apache.fop.complexscripts.util.GlyphContextTester; import org.apache.fop.complexscripts.util.GlyphSequence; import org.apache.fop.complexscripts.util.ScriptContextTester; +import org.apache.fop.util.CharUtilities; // CSOFF: LineLengthCheck @@ -158,9 +159,11 @@ public class ArabicScriptProcessor extends DefaultScriptProcessor { int e = a.getEnd(); if (!hasFinalPrecedingContext(ca, nc, s, e)) { return false; - } else if (forcesFinalThisContext(ca, nc, s, e)) { + } else if (!hasFinalThisContext(ca, nc, s, e)) { + return false; + } else if (forceFinalThisContext(ca, nc, s, e)) { return true; - } else if (!hasFinalFollowingContext(ca, nc, s, e)) { + } else if (!hasFinalSucceedingContext(ca, nc, s, e)) { return false; } else { return true; @@ -179,7 +182,9 @@ public class ArabicScriptProcessor extends DefaultScriptProcessor { int e = a.getEnd(); if (!hasInitialPrecedingContext(ca, nc, s, e)) { return false; - } else if (!hasInitialFollowingContext(ca, nc, s, e)) { + } else if (!hasInitialThisContext(ca, nc, s, e)) { + return false; + } else if (!hasInitialSucceedingContext(ca, nc, s, e)) { return false; } else { return true; @@ -210,7 +215,7 @@ public class ArabicScriptProcessor extends DefaultScriptProcessor { int e = a.getEnd(); if (!hasLigaturePrecedingContext(ca, nc, s, e)) { return false; - } else if (!hasLigatureFollowingContext(ca, nc, s, e)) { + } else if (!hasLigatureSucceedingContext(ca, nc, s, e)) { return false; } else { return true; @@ -231,7 +236,7 @@ public class ArabicScriptProcessor extends DefaultScriptProcessor { return false; } else if (!hasMedialThisContext(ca, nc, s, e)) { return false; - } else if (!hasMedialFollowingContext(ca, nc, s, e)) { + } else if (!hasMedialSucceedingContext(ca, nc, s, e)) { return false; } else { return true; @@ -240,7 +245,7 @@ public class ArabicScriptProcessor extends DefaultScriptProcessor { } private static boolean hasFinalPrecedingContext(int[] ca, int nc, int s, int e) { - int chp = 0; + int chp = 0; // preceding non-NSM char in [0,s) searching back from s int clp = 0; for (int i = s; i > 0; i--) { int k = i - 1; @@ -253,7 +258,7 @@ public class ArabicScriptProcessor extends DefaultScriptProcessor { } } if (clp != BidiConstants.AL) { - return false; + return isZWJ(chp); } else if (hasIsolateInitial(chp)) { return false; } else { @@ -261,8 +266,8 @@ public class ArabicScriptProcessor extends DefaultScriptProcessor { } } - private static boolean forcesFinalThisContext(int[] ca, int nc, int s, int e) { - int chl = 0; + private static boolean hasFinalThisContext(int[] ca, int nc, int s, int e) { + int chl = 0; // last non-{NSM,ZWJ} char in [s,e) int cll = 0; for (int i = 0, n = e - s; i < n; i++) { int k = n - i - 1; @@ -270,7 +275,31 @@ public class ArabicScriptProcessor extends DefaultScriptProcessor { if ((j >= 0) && (j < nc)) { chl = ca [ j ]; cll = BidiClass.getBidiClass(chl); - if (cll != BidiConstants.NSM) { + if ((cll != BidiConstants.NSM) && !isZWJ(chl)) { + break; + } + } + } + if (cll != BidiConstants.AL) { + return false; + } + if (hasIsolateFinal(chl)) { + return false; + } else { + return true; + } + } + + private static boolean forceFinalThisContext(int[] ca, int nc, int s, int e) { + int chl = 0; // last non-{NSM,ZWJ} char in [s,e) + int cll = 0; + for (int i = 0, n = e - s; i < n; i++) { + int k = n - i - 1; + int j = s + k; + if ((j >= 0) && (j < nc)) { + chl = ca [ j ]; + cll = BidiClass.getBidiClass(chl); + if ((cll != BidiConstants.NSM) && !isZWJ(chl)) { break; } } @@ -285,19 +314,19 @@ public class ArabicScriptProcessor extends DefaultScriptProcessor { } } - private static boolean hasFinalFollowingContext(int[] ca, int nc, int s, int e) { - int chf = 0; - int clf = 0; + private static boolean hasFinalSucceedingContext(int[] ca, int nc, int s, int e) { + int chs = 0; // succeeding non-NSM char in [e,nc) searching forward from e + int cls = 0; for (int i = e, n = nc; i < n; i++) { - chf = ca [ i ]; - clf = BidiClass.getBidiClass(chf); - if (clf != BidiConstants.NSM) { + chs = ca [ i ]; + cls = BidiClass.getBidiClass(chs); + if (cls != BidiConstants.NSM) { break; } } - if (clf != BidiConstants.AL) { - return true; - } else if (hasIsolateFinal(chf)) { + if (cls != BidiConstants.AL) { + return !isZWJ(chs); + } else if (hasIsolateFinal(chs)) { return true; } else { return false; @@ -305,7 +334,7 @@ public class ArabicScriptProcessor extends DefaultScriptProcessor { } private static boolean hasInitialPrecedingContext(int[] ca, int nc, int s, int e) { - int chp = 0; + int chp = 0; // preceding non-NSM char in [0,s) searching back from s int clp = 0; for (int i = s; i > 0; i--) { int k = i - 1; @@ -318,7 +347,7 @@ public class ArabicScriptProcessor extends DefaultScriptProcessor { } } if (clp != BidiConstants.AL) { - return true; + return !isZWJ(chp); } else if (hasIsolateInitial(chp)) { return true; } else { @@ -326,19 +355,42 @@ public class ArabicScriptProcessor extends DefaultScriptProcessor { } } - private static boolean hasInitialFollowingContext(int[] ca, int nc, int s, int e) { - int chf = 0; + private static boolean hasInitialThisContext(int[] ca, int nc, int s, int e) { + int chf = 0; // first non-{NSM,ZWJ} char in [s,e) int clf = 0; - for (int i = e, n = nc; i < n; i++) { - chf = ca [ i ]; - clf = BidiClass.getBidiClass(chf); - if (clf != BidiConstants.NSM) { - break; + for (int i = 0, n = e - s; i < n; i++) { + int k = s + i; + if ((k >= 0) && (k < nc)) { + chf = ca [ s + i ]; + clf = BidiClass.getBidiClass(chf); + if ((clf != BidiConstants.NSM) && !isZWJ(chf)) { + break; + } } } if (clf != BidiConstants.AL) { return false; - } else if (hasIsolateFinal(chf)) { + } + if (hasIsolateInitial(chf)) { + return false; + } else { + return true; + } + } + + private static boolean hasInitialSucceedingContext(int[] ca, int nc, int s, int e) { + int chs = 0; // succeeding non-NSM char in [e,nc) searching forward from e + int cls = 0; + for (int i = e, n = nc; i < n; i++) { + chs = ca [ i ]; + cls = BidiClass.getBidiClass(chs); + if (cls != BidiConstants.NSM) { + break; + } + } + if (cls != BidiConstants.AL) { + return isZWJ(chs); + } else if (hasIsolateFinal(chs)) { return false; } else { return true; @@ -346,7 +398,7 @@ public class ArabicScriptProcessor extends DefaultScriptProcessor { } private static boolean hasMedialPrecedingContext(int[] ca, int nc, int s, int e) { - int chp = 0; + int chp = 0; // preceding non-NSM char in [0,s) searching back from s int clp = 0; for (int i = s; i > 0; i--) { int k = i - 1; @@ -359,7 +411,7 @@ public class ArabicScriptProcessor extends DefaultScriptProcessor { } } if (clp != BidiConstants.AL) { - return false; + return isZWJ(chp); } else if (hasIsolateInitial(chp)) { return false; } else { @@ -368,14 +420,14 @@ public class ArabicScriptProcessor extends DefaultScriptProcessor { } private static boolean hasMedialThisContext(int[] ca, int nc, int s, int e) { - int chf = 0; // first non-NSM char in [s,e) + int chf = 0; // first non-{NSM,ZWJ} char in [s,e) int clf = 0; for (int i = 0, n = e - s; i < n; i++) { int k = s + i; if ((k >= 0) && (k < nc)) { chf = ca [ s + i ]; clf = BidiClass.getBidiClass(chf); - if (clf != BidiConstants.NSM) { + if ((clf != BidiConstants.NSM) && !isZWJ(chf)) { break; } } @@ -383,7 +435,7 @@ public class ArabicScriptProcessor extends DefaultScriptProcessor { if (clf != BidiConstants.AL) { return false; } - int chl = 0; // last non-NSM char in [s,e) + int chl = 0; // last non-{NSM,ZWJ} char in [s,e) int cll = 0; for (int i = 0, n = e - s; i < n; i++) { int k = n - i - 1; @@ -391,7 +443,7 @@ public class ArabicScriptProcessor extends DefaultScriptProcessor { if ((j >= 0) && (j < nc)) { chl = ca [ j ]; cll = BidiClass.getBidiClass(chl); - if (cll != BidiConstants.NSM) { + if ((cll != BidiConstants.NSM) && !isZWJ(chl)) { break; } } @@ -408,19 +460,19 @@ public class ArabicScriptProcessor extends DefaultScriptProcessor { } } - private static boolean hasMedialFollowingContext(int[] ca, int nc, int s, int e) { - int chf = 0; - int clf = 0; + private static boolean hasMedialSucceedingContext(int[] ca, int nc, int s, int e) { + int chs = 0; // succeeding non-NSM char in [e,nc) searching forward from e + int cls = 0; for (int i = e, n = nc; i < n; i++) { - chf = ca [ i ]; - clf = BidiClass.getBidiClass(chf); - if (clf != BidiConstants.NSM) { + chs = ca [ i ]; + cls = BidiClass.getBidiClass(chs); + if (cls != BidiConstants.NSM) { break; } } - if (clf != BidiConstants.AL) { - return false; - } else if (hasIsolateFinal(chf)) { + if (cls != BidiConstants.AL) { + return isZWJ(chs); + } else if (hasIsolateFinal(chs)) { return false; } else { return true; @@ -431,17 +483,18 @@ public class ArabicScriptProcessor extends DefaultScriptProcessor { return true; } - private static boolean hasLigatureFollowingContext(int[] ca, int nc, int s, int e) { - int chf = 0; - int clf = 0; + private static boolean hasLigatureSucceedingContext(int[] ca, int nc, int s, int e) { + int chs = 0; // succeeding non-NSM char in [e,nc) searching forward from e + int cls = 0; for (int i = e, n = nc; i < n; i++) { - chf = ca [ i ]; - clf = BidiClass.getBidiClass(chf); - if (clf != BidiConstants.NSM) { + chs = ca [ i ]; + cls = BidiClass.getBidiClass(chs); + // TBD - does ZWJ have impact here? + if (cls != BidiConstants.NSM) { break; } } - if (clf == BidiConstants.AL) { + if (cls == BidiConstants.AL) { return true; } else { return false; @@ -452,7 +505,7 @@ public class ArabicScriptProcessor extends DefaultScriptProcessor { * Ordered array of Unicode scalars designating those Arabic (Script) Letters * which exhibit an isolated form in word initial position. */ - private static int[] isolatedInitials = { + private static final int[] ISOLATED_INITIALS = { 0x0621, // HAMZA 0x0622, // ALEF WITH MADDA ABOVE 0x0623, // ALEF WITH HAMZA ABOVE @@ -502,19 +555,23 @@ public class ArabicScriptProcessor extends DefaultScriptProcessor { }; private static boolean hasIsolateInitial(int ch) { - return Arrays.binarySearch(isolatedInitials, ch) >= 0; + return Arrays.binarySearch(ISOLATED_INITIALS, ch) >= 0; } /** * Ordered array of Unicode scalars designating those Arabic (Script) Letters * which exhibit an isolated form in word final position. */ - private static int[] isolatedFinals = { + private static final int[] ISOLATED_FINALS = { 0x0621 // HAMZA }; private static boolean hasIsolateFinal(int ch) { - return Arrays.binarySearch(isolatedFinals, ch) >= 0; + return Arrays.binarySearch(ISOLATED_FINALS, ch) >= 0; + } + + private static boolean isZWJ(int ch) { + return ch == CharUtilities.ZERO_WIDTH_JOINER; } } diff --git a/test/java/org/apache/fop/complexscripts/scripts/ScriptsTestSuite.java b/test/java/org/apache/fop/complexscripts/scripts/ScriptsTestSuite.java index c48be5220..27914450a 100644 --- a/test/java/org/apache/fop/complexscripts/scripts/ScriptsTestSuite.java +++ b/test/java/org/apache/fop/complexscripts/scripts/ScriptsTestSuite.java @@ -23,12 +23,16 @@ import org.junit.runner.RunWith; import org.junit.runners.Suite; import org.junit.runners.Suite.SuiteClasses; -import org.apache.fop.complexscripts.scripts.arabic.ArabicTestCase; +import org.apache.fop.complexscripts.scripts.arabic.ArabicJoinersTestCase; +import org.apache.fop.complexscripts.scripts.arabic.ArabicWordFormsTestCase; /** * Test suite for script specific functionality related to complex scripts. */ @RunWith(Suite.class) -@SuiteClasses(ArabicTestCase.class) +@SuiteClasses({ + ArabicJoinersTestCase.class, + ArabicWordFormsTestCase.class +}) public class ScriptsTestSuite { } diff --git a/test/java/org/apache/fop/complexscripts/scripts/arabic/ArabicJoinersTestCase.java b/test/java/org/apache/fop/complexscripts/scripts/arabic/ArabicJoinersTestCase.java new file mode 100644 index 000000000..092bebd41 --- /dev/null +++ b/test/java/org/apache/fop/complexscripts/scripts/arabic/ArabicJoinersTestCase.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* $Id$ */ + +package org.apache.fop.complexscripts.scripts.arabic; + +import java.nio.IntBuffer; +import java.util.BitSet; + +import org.junit.Test; + +import static org.junit.Assert.assertTrue; + +import org.apache.fop.complexscripts.scripts.ScriptProcessor; +import org.apache.fop.complexscripts.util.CharScript; +import org.apache.fop.complexscripts.util.GlyphContextTester; +import org.apache.fop.complexscripts.util.GlyphSequence; +import org.apache.fop.complexscripts.util.ScriptContextTester; +import org.apache.fop.complexscripts.util.UTF32; + +// CSOFF: LineLength + +/** + * Tests for joiner (ZWJ, ZWNJ) functionality related to the arabic script. + */ +public class ArabicJoinersTestCase { + + private static final String[][] ZWJ_TESTS_ISOL = new String[][] { + { "\u0643", "1", }, + { "\u0643\u200D", "00", }, + { "\u200D\u0643", "00", }, + { "\u200D\u0643\u200D", "000", }, + }; + + private static final String[][] ZWJ_TESTS_INIT = new String[][] { + { "\u0643", "0", }, + { "\u0643\u200D", "10", }, + { "\u200D\u0643", "00", }, + { "\u200D\u0643\u200D", "000", }, + }; + + private static final String[][] ZWJ_TESTS_MEDI = new String[][] { + { "\u0643", "0", }, + { "\u0643\u200D", "00", }, + { "\u200D\u0643", "00", }, + { "\u200D\u0643\u200D", "010", }, + }; + + private static final String[][] ZWJ_TESTS_FINA = new String[][] { + { "\u0643", "0", }, + { "\u0643\u200D", "00", }, + { "\u200D\u0643", "01", }, + { "\u200D\u0643\u200D", "000", }, + }; + + private static final String[][] ZWJ_TESTS_LIGA = new String[][] { + }; + + @Test + public void testArabicJoiners() { + String script = CharScript.scriptTagFromCode(CharScript.SCRIPT_ARABIC); + ScriptProcessor sp = ScriptProcessor.getInstance(script); + assertTrue(sp != null); + ScriptContextTester sct = sp.getSubstitutionContextTester(); + assertTrue(sct != null); + String language = "dflt"; + int flags = 0; + testZWJ(sct, script, language, "isol", flags, ZWJ_TESTS_ISOL); + testZWJ(sct, script, language, "init", flags, ZWJ_TESTS_INIT); + testZWJ(sct, script, language, "medi", flags, ZWJ_TESTS_MEDI); + testZWJ(sct, script, language, "fina", flags, ZWJ_TESTS_FINA); + testZWJ(sct, script, language, "liga", flags, ZWJ_TESTS_LIGA); + } + + private void testZWJ(ScriptContextTester sct, String script, String language, String feature, int flags, String[][] tests) { + GlyphContextTester gct = sct.getTester(feature); + assertTrue(gct != null); + for (String[] t : tests) { + testZWJ(gct, script, language, feature, flags, t); + } + } + + private void testZWJ(GlyphContextTester gct, String script, String language, String feature, int flags, String[] test) { + assert test.length == 2; + String str = test[0]; + BitSet act = new BitSet(); + GlyphSequence gs = makeGlyphSequence(str); + for (int i = 0, n = str.length(); i < n; ++i) { + if (gct.test(script, language, feature, gs, i, flags)) { + act.set(i); + } + } + BitSet exp = parseBitSet(test[1]); + assertTrue(act.equals(exp)); + } + + private GlyphSequence makeGlyphSequence(String s) { + Integer[] ca = UTF32.toUTF32(s, 0, true); + IntBuffer cb = IntBuffer.allocate(ca.length); + for (Integer c : ca) { + cb.put(c); + } + cb.rewind(); + return new GlyphSequence(cb, null, null); + } + + private BitSet parseBitSet(String s) { + BitSet bits = new BitSet(); + for (int i = 0, n = s.length(); i < n; ++i) { + char c = s.charAt(i); + assert (c == '0') || (c == '1'); + if (c == '1') { + bits.set(i); + } + } + return bits; + } + + @Test + public void testArabicNonJoiners() { + } + +} diff --git a/test/java/org/apache/fop/complexscripts/scripts/arabic/ArabicTestConstants.java b/test/java/org/apache/fop/complexscripts/scripts/arabic/ArabicWordFormsConstants.java similarity index 97% rename from test/java/org/apache/fop/complexscripts/scripts/arabic/ArabicTestConstants.java rename to test/java/org/apache/fop/complexscripts/scripts/arabic/ArabicWordFormsConstants.java index cc9167553..41094b687 100644 --- a/test/java/org/apache/fop/complexscripts/scripts/arabic/ArabicTestConstants.java +++ b/test/java/org/apache/fop/complexscripts/scripts/arabic/ArabicWordFormsConstants.java @@ -22,7 +22,7 @@ package org.apache.fop.complexscripts.scripts.arabic; /** * Constants for test functionality related to the arabic script. */ -public interface ArabicTestConstants { +public interface ArabicWordFormsConstants { String WF_FILE_SCRIPT = "arab"; String WF_FILE_LANGUAGE = "dflt"; diff --git a/test/java/org/apache/fop/complexscripts/scripts/arabic/ArabicTestCase.java b/test/java/org/apache/fop/complexscripts/scripts/arabic/ArabicWordFormsTestCase.java similarity index 98% rename from test/java/org/apache/fop/complexscripts/scripts/arabic/ArabicTestCase.java rename to test/java/org/apache/fop/complexscripts/scripts/arabic/ArabicWordFormsTestCase.java index 8542faae9..abe4f513a 100644 --- a/test/java/org/apache/fop/complexscripts/scripts/arabic/ArabicTestCase.java +++ b/test/java/org/apache/fop/complexscripts/scripts/arabic/ArabicWordFormsTestCase.java @@ -44,7 +44,7 @@ import org.apache.fop.complexscripts.util.GlyphSequence; /** * Tests for functionality related to the arabic script. */ -public class ArabicTestCase implements ArabicTestConstants { +public class ArabicWordFormsTestCase implements ArabicWordFormsConstants { @Test public void testArabicWordForms() { diff --git a/test/java/org/apache/fop/complexscripts/scripts/arabic/GenerateArabicTestData.java b/test/java/org/apache/fop/complexscripts/scripts/arabic/GenerateArabicTestData.java index 9ca16a164..35bfb3128 100644 --- a/test/java/org/apache/fop/complexscripts/scripts/arabic/GenerateArabicTestData.java +++ b/test/java/org/apache/fop/complexscripts/scripts/arabic/GenerateArabicTestData.java @@ -40,7 +40,7 @@ import org.apache.fop.complexscripts.util.GlyphSequence; /** * Tests for functionality related to the arabic script. */ -public final class GenerateArabicTestData implements ArabicTestConstants { +public final class GenerateArabicTestData implements ArabicWordFormsConstants { private GenerateArabicTestData() { } -- 2.39.5