diff options
author | Manuel Mall <manuel@apache.org> | 2006-12-22 09:16:18 +0000 |
---|---|---|
committer | Manuel Mall <manuel@apache.org> | 2006-12-22 09:16:18 +0000 |
commit | c78f7767b3686e851ede4c41d6747fcecc539e83 (patch) | |
tree | b5b0de4af7b90930883ee8055f0681a65b1df7c4 /src/codegen | |
parent | 3c0a84fd754d2a0b981ea1f0d06ae3046d36da4d (diff) | |
download | xmlgraphics-fop-c78f7767b3686e851ede4c41d6747fcecc539e83.tar.gz xmlgraphics-fop-c78f7767b3686e851ede4c41d6747fcecc539e83.zip |
Added (limited) support for Unicode UAX#14 compliant line breaking. Thanks to Joerg Pietschman, who supplied the core code for the Unicode line breaking algorithm.
git-svn-id: https://svn.apache.org/repos/asf/xmlgraphics/fop/trunk@489585 13f79535-47bb-0310-9956-ffa450edef68
Diffstat (limited to 'src/codegen')
-rw-r--r-- | src/codegen/fo/colorkw.xml (renamed from src/codegen/colorkw.xml) | 0 | ||||
-rw-r--r-- | src/codegen/fo/constants.xml (renamed from src/codegen/constants.xml) | 0 | ||||
-rw-r--r-- | src/codegen/fo/constants.xsl (renamed from src/codegen/constants.xsl) | 0 | ||||
-rw-r--r-- | src/codegen/fo/fo-property-mapping.xsl (renamed from src/codegen/fo-property-mapping.xsl) | 0 | ||||
-rw-r--r-- | src/codegen/fo/foelements.xml (renamed from src/codegen/foelements.xml) | 0 | ||||
-rw-r--r-- | src/codegen/fo/foproperties.xml (renamed from src/codegen/foproperties.xml) | 0 | ||||
-rw-r--r-- | src/codegen/fo/properties.dtd (renamed from src/codegen/properties.dtd) | 0 | ||||
-rw-r--r-- | src/codegen/fo/property-sets.xsl (renamed from src/codegen/property-sets.xsl) | 0 | ||||
-rw-r--r-- | src/codegen/fo/propinc.xsl (renamed from src/codegen/propinc.xsl) | 0 | ||||
-rw-r--r-- | src/codegen/fo/propmaker.xsl (renamed from src/codegen/propmaker.xsl) | 0 | ||||
-rw-r--r-- | src/codegen/fonts/Courier.xml (renamed from src/codegen/Courier.xml) | 0 | ||||
-rw-r--r-- | src/codegen/fonts/CourierBold.xml (renamed from src/codegen/CourierBold.xml) | 0 | ||||
-rw-r--r-- | src/codegen/fonts/CourierBoldOblique.xml (renamed from src/codegen/CourierBoldOblique.xml) | 0 | ||||
-rw-r--r-- | src/codegen/fonts/CourierOblique.xml (renamed from src/codegen/CourierOblique.xml) | 0 | ||||
-rw-r--r-- | src/codegen/fonts/Helvetica.xml (renamed from src/codegen/Helvetica.xml) | 0 | ||||
-rw-r--r-- | src/codegen/fonts/HelveticaBold.xml (renamed from src/codegen/HelveticaBold.xml) | 0 | ||||
-rw-r--r-- | src/codegen/fonts/HelveticaBoldOblique.xml (renamed from src/codegen/HelveticaBoldOblique.xml) | 0 | ||||
-rw-r--r-- | src/codegen/fonts/HelveticaOblique.xml (renamed from src/codegen/HelveticaOblique.xml) | 0 | ||||
-rw-r--r-- | src/codegen/fonts/Symbol.xml (renamed from src/codegen/Symbol.xml) | 0 | ||||
-rw-r--r-- | src/codegen/fonts/TimesBold.xml (renamed from src/codegen/TimesBold.xml) | 0 | ||||
-rw-r--r-- | src/codegen/fonts/TimesBoldItalic.xml (renamed from src/codegen/TimesBoldItalic.xml) | 0 | ||||
-rw-r--r-- | src/codegen/fonts/TimesItalic.xml (renamed from src/codegen/TimesItalic.xml) | 0 | ||||
-rw-r--r-- | src/codegen/fonts/TimesRoman.xml (renamed from src/codegen/TimesRoman.xml) | 0 | ||||
-rw-r--r-- | src/codegen/fonts/ZapfDingbats.xml (renamed from src/codegen/ZapfDingbats.xml) | 0 | ||||
-rw-r--r-- | src/codegen/fonts/charlist.xml (renamed from src/codegen/charlist.xml) | 0 | ||||
-rw-r--r-- | src/codegen/fonts/code-point-mapping.xsl (renamed from src/codegen/code-point-mapping.xsl) | 0 | ||||
-rw-r--r-- | src/codegen/fonts/encodings.xml (renamed from src/codegen/encodings.xml) | 0 | ||||
-rw-r--r-- | src/codegen/fonts/font-file.xsl (renamed from src/codegen/font-file.xsl) | 0 | ||||
-rw-r--r-- | src/codegen/fonts/glyphlist.xml (renamed from src/codegen/glyphlist.xml) | 0 | ||||
-rw-r--r-- | src/codegen/fonts/t1font-file.xsl (renamed from src/codegen/t1font-file.xsl) | 0 | ||||
-rw-r--r-- | src/codegen/fonts/ttffontfile.xsl (renamed from src/codegen/ttffontfile.xsl) | 0 | ||||
-rwxr-xr-x | src/codegen/unicode/data/LineBreakPairTable.txt | 28 | ||||
-rw-r--r-- | src/codegen/unicode/java/org/apache/fop/text/linebreak/GenerateLineBreakUtils.java | 659 |
33 files changed, 687 insertions, 0 deletions
diff --git a/src/codegen/colorkw.xml b/src/codegen/fo/colorkw.xml index 0a0c2717d..0a0c2717d 100644 --- a/src/codegen/colorkw.xml +++ b/src/codegen/fo/colorkw.xml diff --git a/src/codegen/constants.xml b/src/codegen/fo/constants.xml index d477506bb..d477506bb 100644 --- a/src/codegen/constants.xml +++ b/src/codegen/fo/constants.xml diff --git a/src/codegen/constants.xsl b/src/codegen/fo/constants.xsl index af51489af..af51489af 100644 --- a/src/codegen/constants.xsl +++ b/src/codegen/fo/constants.xsl diff --git a/src/codegen/fo-property-mapping.xsl b/src/codegen/fo/fo-property-mapping.xsl index 408144a36..408144a36 100644 --- a/src/codegen/fo-property-mapping.xsl +++ b/src/codegen/fo/fo-property-mapping.xsl diff --git a/src/codegen/foelements.xml b/src/codegen/fo/foelements.xml index db1360aab..db1360aab 100644 --- a/src/codegen/foelements.xml +++ b/src/codegen/fo/foelements.xml diff --git a/src/codegen/foproperties.xml b/src/codegen/fo/foproperties.xml index f4a19eb27..f4a19eb27 100644 --- a/src/codegen/foproperties.xml +++ b/src/codegen/fo/foproperties.xml diff --git a/src/codegen/properties.dtd b/src/codegen/fo/properties.dtd index 8ae36acb0..8ae36acb0 100644 --- a/src/codegen/properties.dtd +++ b/src/codegen/fo/properties.dtd diff --git a/src/codegen/property-sets.xsl b/src/codegen/fo/property-sets.xsl index 761a2cd23..761a2cd23 100644 --- a/src/codegen/property-sets.xsl +++ b/src/codegen/fo/property-sets.xsl diff --git a/src/codegen/propinc.xsl b/src/codegen/fo/propinc.xsl index 9f9adf9ef..9f9adf9ef 100644 --- a/src/codegen/propinc.xsl +++ b/src/codegen/fo/propinc.xsl diff --git a/src/codegen/propmaker.xsl b/src/codegen/fo/propmaker.xsl index 21cde4c53..21cde4c53 100644 --- a/src/codegen/propmaker.xsl +++ b/src/codegen/fo/propmaker.xsl diff --git a/src/codegen/Courier.xml b/src/codegen/fonts/Courier.xml index 89c7314db..89c7314db 100644 --- a/src/codegen/Courier.xml +++ b/src/codegen/fonts/Courier.xml diff --git a/src/codegen/CourierBold.xml 
b/src/codegen/fonts/CourierBold.xml index 92a777a50..92a777a50 100644 --- a/src/codegen/CourierBold.xml +++ b/src/codegen/fonts/CourierBold.xml diff --git a/src/codegen/CourierBoldOblique.xml b/src/codegen/fonts/CourierBoldOblique.xml index 914fdab84..914fdab84 100644 --- a/src/codegen/CourierBoldOblique.xml +++ b/src/codegen/fonts/CourierBoldOblique.xml diff --git a/src/codegen/CourierOblique.xml b/src/codegen/fonts/CourierOblique.xml index 3b043c17c..3b043c17c 100644 --- a/src/codegen/CourierOblique.xml +++ b/src/codegen/fonts/CourierOblique.xml diff --git a/src/codegen/Helvetica.xml b/src/codegen/fonts/Helvetica.xml index d63eb5a11..d63eb5a11 100644 --- a/src/codegen/Helvetica.xml +++ b/src/codegen/fonts/Helvetica.xml diff --git a/src/codegen/HelveticaBold.xml b/src/codegen/fonts/HelveticaBold.xml index c417937b4..c417937b4 100644 --- a/src/codegen/HelveticaBold.xml +++ b/src/codegen/fonts/HelveticaBold.xml diff --git a/src/codegen/HelveticaBoldOblique.xml b/src/codegen/fonts/HelveticaBoldOblique.xml index 087b225e4..087b225e4 100644 --- a/src/codegen/HelveticaBoldOblique.xml +++ b/src/codegen/fonts/HelveticaBoldOblique.xml diff --git a/src/codegen/HelveticaOblique.xml b/src/codegen/fonts/HelveticaOblique.xml index d913b6d51..d913b6d51 100644 --- a/src/codegen/HelveticaOblique.xml +++ b/src/codegen/fonts/HelveticaOblique.xml diff --git a/src/codegen/Symbol.xml b/src/codegen/fonts/Symbol.xml index 241d4d2c6..241d4d2c6 100644 --- a/src/codegen/Symbol.xml +++ b/src/codegen/fonts/Symbol.xml diff --git a/src/codegen/TimesBold.xml b/src/codegen/fonts/TimesBold.xml index 12bb17580..12bb17580 100644 --- a/src/codegen/TimesBold.xml +++ b/src/codegen/fonts/TimesBold.xml diff --git a/src/codegen/TimesBoldItalic.xml b/src/codegen/fonts/TimesBoldItalic.xml index 540e891e1..540e891e1 100644 --- a/src/codegen/TimesBoldItalic.xml +++ b/src/codegen/fonts/TimesBoldItalic.xml diff --git a/src/codegen/TimesItalic.xml b/src/codegen/fonts/TimesItalic.xml index 4868aed05..4868aed05 
100644 --- a/src/codegen/TimesItalic.xml +++ b/src/codegen/fonts/TimesItalic.xml diff --git a/src/codegen/TimesRoman.xml b/src/codegen/fonts/TimesRoman.xml index 1f21290de..1f21290de 100644 --- a/src/codegen/TimesRoman.xml +++ b/src/codegen/fonts/TimesRoman.xml diff --git a/src/codegen/ZapfDingbats.xml b/src/codegen/fonts/ZapfDingbats.xml index 0420908a0..0420908a0 100644 --- a/src/codegen/ZapfDingbats.xml +++ b/src/codegen/fonts/ZapfDingbats.xml diff --git a/src/codegen/charlist.xml b/src/codegen/fonts/charlist.xml index 110714af6..110714af6 100644 --- a/src/codegen/charlist.xml +++ b/src/codegen/fonts/charlist.xml diff --git a/src/codegen/code-point-mapping.xsl b/src/codegen/fonts/code-point-mapping.xsl index 7d0d6cd71..7d0d6cd71 100644 --- a/src/codegen/code-point-mapping.xsl +++ b/src/codegen/fonts/code-point-mapping.xsl diff --git a/src/codegen/encodings.xml b/src/codegen/fonts/encodings.xml index 85aabb21f..85aabb21f 100644 --- a/src/codegen/encodings.xml +++ b/src/codegen/fonts/encodings.xml diff --git a/src/codegen/font-file.xsl b/src/codegen/fonts/font-file.xsl index 72ee81a68..72ee81a68 100644 --- a/src/codegen/font-file.xsl +++ b/src/codegen/fonts/font-file.xsl diff --git a/src/codegen/glyphlist.xml b/src/codegen/fonts/glyphlist.xml index aeb56e2b6..aeb56e2b6 100644 --- a/src/codegen/glyphlist.xml +++ b/src/codegen/fonts/glyphlist.xml diff --git a/src/codegen/t1font-file.xsl b/src/codegen/fonts/t1font-file.xsl index f5c285097..f5c285097 100644 --- a/src/codegen/t1font-file.xsl +++ b/src/codegen/fonts/t1font-file.xsl diff --git a/src/codegen/ttffontfile.xsl b/src/codegen/fonts/ttffontfile.xsl index 7231010bf..7231010bf 100644 --- a/src/codegen/ttffontfile.xsl +++ b/src/codegen/fonts/ttffontfile.xsl diff --git a/src/codegen/unicode/data/LineBreakPairTable.txt b/src/codegen/unicode/data/LineBreakPairTable.txt new file mode 100755 index 000000000..93388e1bd --- /dev/null +++ b/src/codegen/unicode/data/LineBreakPairTable.txt @@ -0,0 +1,28 @@ + OP CL QU GL NS 
EX SY IS PR PO NU AL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT +OP ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ @ ^ ^ ^ ^ ^ ^ +CL _ ^ % % ^ ^ ^ ^ % % % % _ _ % % _ _ ^ # ^ _ _ _ _ _ +QU ^ ^ % % % ^ ^ ^ % % % % % % % % % % ^ # ^ % % % % % +GL % ^ % % % ^ ^ ^ % % % % % % % % % % ^ # ^ % % % % % +NS _ ^ % % % ^ ^ ^ _ _ _ _ _ _ % % _ _ ^ # ^ _ _ _ _ _ +EX _ ^ % % % ^ ^ ^ _ _ _ _ _ _ % % _ _ ^ # ^ _ _ _ _ _ +SY _ ^ % % % ^ ^ ^ _ _ % _ _ _ % % _ _ ^ # ^ _ _ _ _ _ +IS _ ^ % % % ^ ^ ^ _ _ % % _ _ % % _ _ ^ # ^ _ _ _ _ _ +PR % ^ % % % ^ ^ ^ _ _ % % % _ % % _ _ ^ # ^ % % % % % +PO % ^ % % % ^ ^ ^ _ _ % % _ _ % % _ _ ^ # ^ _ _ _ _ _ +NU % ^ % % % ^ ^ ^ % % % % _ % % % _ _ ^ # ^ _ _ _ _ _ +AL % ^ % % % ^ ^ ^ _ _ % % _ % % % _ _ ^ # ^ _ _ _ _ _ +ID _ ^ % % % ^ ^ ^ _ % _ _ _ % % % _ _ ^ # ^ _ _ _ _ _ +IN _ ^ % % % ^ ^ ^ _ _ _ _ _ % % % _ _ ^ # ^ _ _ _ _ _ +HY _ ^ % % % ^ ^ ^ _ _ % _ _ _ % % _ _ ^ # ^ _ _ _ _ _ +BA _ ^ % % % ^ ^ ^ _ _ _ _ _ _ % % _ _ ^ # ^ _ _ _ _ _ +BB % ^ % % % ^ ^ ^ % % % % % % % % % % ^ # ^ % % % % % +B2 _ ^ % % % ^ ^ ^ _ _ _ _ _ _ % % _ ^ ^ # ^ _ _ _ _ _ +ZW _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ ^ _ _ _ _ _ _ _ +CM _ ^ % % % ^ ^ ^ _ _ % % _ % % % _ _ ^ # ^ _ _ _ _ _ +WJ % ^ % % % ^ ^ ^ % % % % % % % % % % ^ # ^ % % % % % +H2 _ ^ % % % ^ ^ ^ _ % _ _ _ % % % _ _ ^ # ^ _ _ _ % % +H3 _ ^ % % % ^ ^ ^ _ % _ _ _ % % % _ _ ^ # ^ _ _ _ _ % +JL _ ^ % % % ^ ^ ^ _ % _ _ _ % % % _ _ ^ # ^ % % % % _ +JV _ ^ % % % ^ ^ ^ _ % _ _ _ % % % _ _ ^ # ^ _ _ _ % % +JT _ ^ % % % ^ ^ ^ _ % _ _ _ % % % _ _ ^ # ^ _ _ _ _ % + diff --git a/src/codegen/unicode/java/org/apache/fop/text/linebreak/GenerateLineBreakUtils.java b/src/codegen/unicode/java/org/apache/fop/text/linebreak/GenerateLineBreakUtils.java new file mode 100644 index 000000000..b956cabff --- /dev/null +++ b/src/codegen/unicode/java/org/apache/fop/text/linebreak/GenerateLineBreakUtils.java @@ -0,0 +1,659 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* $Id$ */

package org.apache.fop.text.linebreak;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;

/**
 * <p>Utility for generating a Java class representing line break properties
 * from the Unicode property files.</p>
 * <p>Customizations:</p>
 * <ul>
 * <li>The pair table file is a cut+paste of the sample table from the TR14
 * HTML file into a text file.</li>
 * <li>Because the sample table does not cover all line break classes, check the
 * 'not in pair table' list of property value short names.</li>
 * <li>Check MAX_LINE_LENGTH.</li>
 * </ul>
 * <p>All state is kept in static fields; the class is meant to be run once from
 * the command line via {@link #main(String[])}.</p>
 */
public class GenerateLineBreakUtils {

    /** Soft limit for the length of the generated lines holding the name arrays. */
    private static final int MAX_LINE_LENGTH = 110;

    private static final byte DIRECT_BREAK = 0;                 // _ in table
    private static final byte INDIRECT_BREAK = 1;               // % in table
    private static final byte COMBINING_INDIRECT_BREAK = 2;     // # in table
    private static final byte COMBINING_PROHIBITED_BREAK = 3;   // @ in table
    private static final byte PROHIBITED_BREAK = 4;             // ^ in table
    private static final byte EXPLICIT_BREAK = 5;               // ! in rules

    /** Short names of the line break classes which are deliberately absent from the pair table. */
    private static final String notInPairTable[]
        = { "AI", "BK", "CB", "CR", "LF", "NL", "SA", "SG", "SP", "XX" };

    /** Line break property index for every base plane code point, 0 = not assigned in the data file. */
    private static final byte lineBreakProperties[] = new byte[0x10000];
    /** Maps a property short name (String) to its 1-based index (Byte). */
    private static final Map lineBreakPropertyValues = new HashMap();
    /** Property short names; list position + 1 is the property index. */
    private static final List lineBreakPropertyShortNames = new ArrayList();
    /** Property long names; list position + 1 is the property index. */
    private static final List lineBreakPropertyLongNames = new ArrayList();

    /**
     * Generate a class managing line break properties for Unicode characters and a sample
     * table for the table driven line breaking algorithm described in
     * <a href="http://unicode.org/reports/tr14/#PairBasedImplementation">UTR #14</a>.
     * TODO: Code points above the base plane are simply ignored.
     *
     * @param lineBreakFileName URL of line break property file (part of Unicode files).
     * @param propertyValueFileName URL of property values alias file (part of Unicode files).
     * @param breakPairFileName Name of pair table file (<i>not</i> part of the unicode files).
     * @param outFileName Name of the output file.
     * @throws Exception in case anything goes wrong.
     */
    private static void convertLineBreakProperties(
        String lineBreakFileName,
        String propertyValueFileName,
        String breakPairFileName,
        String outFileName)
        throws Exception {

        readLineBreakProperties(lineBreakFileName, propertyValueFileName);

        // read break pair table
        int lineBreakPropertyValueCount = lineBreakPropertyValues.size();
        int tableSize = lineBreakPropertyValueCount - notInPairTable.length;
        Map notInPairTableMap = new HashMap(notInPairTable.length);
        for (int i = 0; i < notInPairTable.length; i++) {
            Object v = lineBreakPropertyValues.get(notInPairTable[i]);
            if (v == null) {
                throw new Exception("'not in pair table' property not found: " + notInPairTable[i]);
            }
            notInPairTableMap.put(notInPairTable[i], v);
        }
        byte pairTable[][] = new byte[tableSize][];
        byte columnHeader[] = new byte[tableSize];
        byte rowHeader[] = new byte[tableSize];
        // The maps translate a 1-based property index into a table column/row; (byte)255 == -1
        // marks properties which have no column/row in the pair table.
        byte columnMap[] = new byte[lineBreakPropertyValueCount + 1];
        Arrays.fill(columnMap, (byte)255);
        byte rowMap[] = new byte[lineBreakPropertyValueCount + 1];
        Arrays.fill(rowMap, (byte)255);

        BufferedReader b = new BufferedReader(new FileReader(breakPairFileName));
        try {
            String line = b.readLine();
            int lineNumber = 1;
            // read header
            if (line == null) {
                throw new Exception(breakPairFileName + ':' + lineNumber + ": can't read table header");
            }
            StringTokenizer headerTokens = new StringTokenizer(line);
            byte headerCount = 0;
            while (headerTokens.hasMoreTokens()) {
                String name = headerTokens.nextToken();
                if (headerCount >= columnHeader.length) {
                    throw new Exception(
                        breakPairFileName + ':' + lineNumber + ": unexpected column header " + name);
                }
                if (notInPairTableMap.get(name) != null) {
                    throw new Exception(
                        breakPairFileName + ':' + lineNumber + ": invalid column header " + name);
                }
                Byte v = (Byte)lineBreakPropertyValues.get(name);
                if (v == null) {
                    throw new Exception(
                        breakPairFileName + ':' + lineNumber + ": unknown column header " + name);
                }
                byte vv = v.byteValue();
                columnHeader[headerCount] = vv;
                columnMap[vv] = headerCount;
                headerCount++;
            }
            if (headerCount < columnHeader.length) {
                throw new Exception(
                    breakPairFileName + ':' + lineNumber + ": missing column for properties: "
                        + missingProperties(columnHeader, headerCount, notInPairTableMap));
            }

            // read rows; the table ends at the first empty line or at end of file
            line = readTrimmedLine(b);
            lineNumber++;
            byte rowNumber = 0;
            while (line != null && line.length() > 0) {
                if (rowNumber >= rowHeader.length) {
                    throw new Exception(breakPairFileName + ':' + lineNumber + ": unexpected row " + line);
                }
                pairTable[rowNumber] = new byte[tableSize];
                StringTokenizer tok = new StringTokenizer(line);
                if (!tok.hasMoreTokens()) {
                    throw new Exception(breakPairFileName + ':' + lineNumber + ": can't read row header");
                }
                String rowName = tok.nextToken();
                if (notInPairTableMap.get(rowName) != null) {
                    throw new Exception(
                        breakPairFileName + ':' + lineNumber + ": invalid row header " + rowName);
                }
                Byte rowValue = (Byte)lineBreakPropertyValues.get(rowName);
                if (rowValue == null) {
                    throw new Exception(
                        breakPairFileName + ':' + lineNumber + ": unknown row header " + rowName);
                }
                rowHeader[rowNumber] = rowValue.byteValue();
                rowMap[rowValue.byteValue()] = rowNumber;

                int columnNumber = 0;
                while (tok.hasMoreTokens()) {
                    String token = tok.nextToken();
                    if (columnNumber >= tableSize) {
                        // would otherwise surface as a bare ArrayIndexOutOfBoundsException
                        throw new Exception(
                            breakPairFileName + ':' + lineNumber + ": unexpected table entry " + token);
                    }
                    if (token.length() != 1) {
                        throw new Exception(
                            breakPairFileName + ':' + lineNumber + ": token too long: " + token);
                    }
                    switch (token.charAt(0)) {
                    case '^' :
                        pairTable[rowNumber][columnNumber] = PROHIBITED_BREAK;
                        break;
                    case '%' :
                        pairTable[rowNumber][columnNumber] = INDIRECT_BREAK;
                        break;
                    case '_' :
                        pairTable[rowNumber][columnNumber] = DIRECT_BREAK;
                        break;
                    case '#' :
                        pairTable[rowNumber][columnNumber] = COMBINING_INDIRECT_BREAK;
                        break;
                    case '@' :
                        pairTable[rowNumber][columnNumber] = COMBINING_PROHIBITED_BREAK;
                        break;
                    default :
                        throw new Exception(
                            breakPairFileName + ':' + lineNumber + ": unexpected token: " + token);
                    }
                    columnNumber++;
                }
                if (columnNumber < tableSize) {
                    // a short row would silently be padded with DIRECT_BREAK entries
                    throw new Exception(
                        breakPairFileName + ':' + lineNumber + ": row " + rowName + " has only "
                            + columnNumber + " of " + tableSize + " entries");
                }
                line = readTrimmedLine(b);
                lineNumber++;
                rowNumber++;
            }
            if (rowNumber < rowHeader.length) {
                throw new Exception(
                    breakPairFileName + ':' + lineNumber + ": missing row for properties: "
                        + missingProperties(rowHeader, rowNumber, notInPairTableMap));
            }
        } finally {
            b.close();
        }

        // generate class
        PrintWriter out = new PrintWriter(new FileWriter(outFileName));
        try {
            printLineBreakUtils(out, lineBreakPropertyValueCount, pairTable, columnMap, rowMap);
            // PrintWriter never throws on write errors, so ask explicitly (this also flushes)
            if (out.checkError()) {
                throw new IOException("error writing " + outFileName);
            }
        } finally {
            out.close();
        }
    }

    /**
     * Read the next line and strip leading and trailing white space.
     *
     * @param reader the reader to read from
     * @return the trimmed line or null at end of file
     * @throws IOException if reading fails
     */
    private static String readTrimmedLine(BufferedReader reader) throws IOException {
        String line = reader.readLine();
        return (line == null) ? null : line.trim();
    }

    /**
     * List the short names of all line break properties which are expected in the pair
     * table but whose index does not occur in the given header.
     *
     * @param header property indices found so far
     * @param used number of valid entries in header
     * @param excluded short names (keys) of the properties which are not part of the pair table
     * @return comma separated list of short names
     */
    private static String missingProperties(byte[] header, int used, Map excluded) {
        StringBuffer missing = new StringBuffer();
        for (int j = 0; j < lineBreakPropertyShortNames.size(); j++) {
            String name = (String)lineBreakPropertyShortNames.get(j);
            if (excluded.containsKey(name)) {
                continue;
            }
            boolean found = false;
            for (int k = 0; k < used; k++) {
                if (header[k] == j + 1) {
                    found = true;
                    break;
                }
            }
            if (!found) {
                if (missing.length() > 0) {
                    missing.append(", ");
                }
                missing.append(name);
            }
        }
        return missing.toString();
    }

    /**
     * Print the source of the generated LineBreakUtils class.
     *
     * @param out destination
     * @param lineBreakPropertyValueCount number of line break property values
     * @param pairTable pair table as read from the pair table file, indexed [row][column]
     * @param columnMap maps a property index to its pair table column, -1 if there is none
     * @param rowMap maps a property index to its pair table row, -1 if there is none
     */
    private static void printLineBreakUtils(
        PrintWriter out,
        int lineBreakPropertyValueCount,
        byte[][] pairTable,
        byte[] columnMap,
        byte[] rowMap) {

        // Two stage storage: the code point range is cut into rowsize blocks of blocksize
        // entries, identical blocks are shared.
        int rowsize = 512;
        int blocksize = lineBreakProperties.length / rowsize;
        byte row[][] = new byte[rowsize][];
        int idx = 0;
        StringBuffer doStaticLinkCode = new StringBuffer();

        out.println("/*");
        out.println(" * Licensed to the Apache Software Foundation (ASF) under one or more");
        out.println(" * contributor license agreements.  See the NOTICE file distributed with");
        out.println(" * this work for additional information regarding copyright ownership.");
        out.println(" * The ASF licenses this file to You under the Apache License, Version 2.0");
        out.println(" * (the \"License\"); you may not use this file except in compliance with");
        out.println(" * the License.  You may obtain a copy of the License at");
        out.println(" * ");
        out.println(" *      http://www.apache.org/licenses/LICENSE-2.0");
        out.println(" * ");
        out.println(" * Unless required by applicable law or agreed to in writing, software");
        out.println(" * distributed under the License is distributed on an \"AS IS\" BASIS,");
        out.println(" * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.");
        out.println(" * See the License for the specific language governing permissions and");
        out.println(" * limitations under the License.");
        out.println(" */");
        out.println();
        out.println("/* $Id$ */");
        out.println();
        // the generated class is package private, so it has to live in the FOP package
        out.println("package org.apache.fop.text.linebreak;");
        out.println();
        out.println("/* ");
        out.println(" * This is a generated file, DO NOT CHANGE!");
        out.println(" */");
        out.println();
        out.println("class LineBreakUtils {");
        out.println();
        out.println("    public static final byte DIRECT_BREAK = " + DIRECT_BREAK + ';');
        out.println("    public static final byte INDIRECT_BREAK = " + INDIRECT_BREAK + ';');
        out.println("    public static final byte COMBINING_INDIRECT_BREAK = "
            + COMBINING_INDIRECT_BREAK + ';');
        out.println("    public static final byte COMBINING_PROHIBITED_BREAK = "
            + COMBINING_PROHIBITED_BREAK + ';');
        out.println("    public static final byte PROHIBITED_BREAK = " + PROHIBITED_BREAK + ';');
        out.println("    public static final byte EXPLICIT_BREAK = " + EXPLICIT_BREAK + ';');
        out.println();

        // pair table, indexed by property index - 1; classes without a row/column get 0
        out.println("    private static final byte PAIR_TABLE[][] = {");
        boolean printComma = false;
        for (int i = 1; i <= lineBreakPropertyValueCount; i++) {
            if (printComma) {
                out.println(',');
            } else {
                printComma = true;
            }
            out.print("        {");
            boolean localPrintComma = false;
            for (int j = 1; j <= lineBreakPropertyValueCount; j++) {
                if (localPrintComma) {
                    out.print(',');
                } else {
                    localPrintComma = true;
                }
                if (columnMap[j] != -1 && rowMap[i] != -1) {
                    out.print(pairTable[rowMap[i]][columnMap[j]]);
                } else {
                    out.print('0');
                }
            }
            out.print('}');
        }
        out.println("};");
        out.println();

        // property blocks; the initialisation is spread over several init_N() methods of at
        // most 64 blocks each
        out.println("    private static byte lineBreakProperties[][] = new byte[" + rowsize + "][];");
        out.println();
        out.println("    private static void init_0() {");
        int rowsPrinted = 0;
        int initSections = 0;
        for (int i = 0; i < rowsize; i++) {
            // look for an identical block which has already been printed
            boolean found = false;
            for (int j = 0; j < i; j++) {
                if (row[j] != null) {
                    boolean matched = true;
                    for (int k = 0; k < blocksize; k++) {
                        if (row[j][k] != lineBreakProperties[idx + k]) {
                            matched = false;
                            break;
                        }
                    }
                    if (matched) {
                        found = true;
                        doStaticLinkCode.append("        lineBreakProperties[");
                        doStaticLinkCode.append(i);
                        doStaticLinkCode.append("]=lineBreakProperties[");
                        doStaticLinkCode.append(j);
                        doStaticLinkCode.append("];\n");
                        break;
                    }
                }
            }
            if (!found) {
                if (rowsPrinted >= 64) {
                    out.println("    };");
                    out.println();
                    initSections++;
                    out.println("    private static void init_" + initSections + "() {");
                    rowsPrinted = 0;
                }
                row[i] = new byte[blocksize];
                boolean printLocalComma = false;
                out.print("        lineBreakProperties[" + i + "] = new byte[] { ");
                for (int k = 0; k < blocksize; k++) {
                    row[i][k] = lineBreakProperties[idx + k];
                    if (printLocalComma) {
                        out.print(',');
                    } else {
                        printLocalComma = true;
                    }
                    out.print(row[i][k]);
                }
                out.println("};");
                rowsPrinted++;
            }
            idx += blocksize;
        }
        out.println("    };");
        out.println();
        out.println("    static {");
        for (int i = 0; i <= initSections; i++) {
            out.println("        init_" + i + "();");
        }
        out.print(doStaticLinkCode);
        out.println("    };");
        out.println();

        for (int i = 0; i < lineBreakPropertyShortNames.size(); i++) {
            String shortName = (String)lineBreakPropertyShortNames.get(i);
            out.print("    public static final byte LINE_BREAK_PROPERTY_");
            out.print(shortName);
            out.print('=');
            out.print(i + 1);
            out.println(';');
        }
        out.println();
        printNameArray(out,
            "    private static String lineBreakPropertyShortNames[] = {",
            lineBreakPropertyShortNames);
        out.println();
        printNameArray(out,
            "    private static String lineBreakPropertyLongNames[] = {",
            lineBreakPropertyLongNames);
        out.println();
        out.println("    public static String getLineBreakPropertyShortName(byte i) {");
        out.println("        if (i>0 && i<=lineBreakPropertyShortNames.length) {");
        out.println("            return lineBreakPropertyShortNames[i-1];");
        out.println("        } else {");
        out.println("            return null;");
        out.println("        }");
        out.println("    }");
        out.println();
        out.println("    public static String getLineBreakPropertyLongName(byte i) {");
        out.println("        if (i>0 && i<=lineBreakPropertyLongNames.length) {");
        out.println("            return lineBreakPropertyLongNames[i-1];");
        out.println("        } else {");
        out.println("            return null;");
        out.println("        }");
        out.println("    }");
        out.println();
        out.println("    public static byte getLineBreakProperty(char c) {");
        out.println("        return lineBreakProperties[c/" + blocksize + "][c%" + blocksize + "];");
        out.println("    }");
        out.println();
        out.println(
            "    public static byte getLineBreakPairProperty(int lineBreakPropertyBefore,"
                + "int lineBreakPropertyAfter) {");
        out.println("        return PAIR_TABLE[lineBreakPropertyBefore-1][lineBreakPropertyAfter-1];");
        out.println("    }");
        out.println();
        out.println("};");
    }

    /**
     * Print a String array initializer, wrapping the output once a line has grown
     * beyond MAX_LINE_LENGTH.
     *
     * @param out destination
     * @param prefix declaration up to and including the opening brace
     * @param names the array elements (Strings)
     */
    private static void printNameArray(PrintWriter out, String prefix, List names) {
        out.print(prefix);
        int lineLength = prefix.length();
        boolean printComma = false;
        for (int i = 0; i < names.size(); i++) {
            String name = (String)names.get(i);
            if (printComma) {
                out.print(',');
                lineLength++;
            } else {
                printComma = true;
            }
            if (lineLength > MAX_LINE_LENGTH) {
                out.println();
                out.print("        ");
                lineLength = 8;
            }
            out.print('"');
            out.print(name);
            out.print('"');
            lineLength += (2 + name.length());
        }
        out.println("};");
    }

    /**
     * Read line break property value names and the actual properties for the Unicode
     * characters from the respective Unicode files. Any data left over from a previous
     * call is discarded first.
     * TODO: Code points above the base plane are simply ignored.
     *
     * @param lineBreakFileName URL of line break property file.
     * @param propertyValueFileName URL of property values alias file.
     * @throws Exception in case anything goes wrong.
     */
    private static void readLineBreakProperties(String lineBreakFileName, String propertyValueFileName)
        throws Exception {
        // start from scratch so that a second call does not register every property twice
        lineBreakPropertyValues.clear();
        lineBreakPropertyShortNames.clear();
        lineBreakPropertyLongNames.clear();

        // read property names
        byte indexForUnknown = 0;
        BufferedReader b = new BufferedReader(
            new InputStreamReader(new URL(propertyValueFileName).openStream()));
        try {
            String line = b.readLine();
            int lineNumber = 1;
            byte propertyIndex = 1;
            while (line != null) {
                // relevant lines look like "lb ; <short name> ; <long name> [; <alias>]"
                if (line.startsWith("lb")) {
                    String shortName;
                    String longName = null;
                    int semi = line.indexOf(';');
                    if (semi < 0) {
                        throw new Exception(
                            propertyValueFileName + ':' + lineNumber
                                + ": missing property short name in " + line);
                    }
                    line = line.substring(semi + 1);
                    semi = line.indexOf(';');
                    if (semi > 0) {
                        shortName = line.substring(0, semi).trim();
                        longName = line.substring(semi + 1).trim();
                        semi = longName.indexOf(';');
                        if (semi > 0) {
                            longName = longName.substring(0, semi).trim();
                        }
                    } else {
                        shortName = line.trim();
                    }
                    if (longName == null) {
                        // keep both name lists free of nulls, the generator dereferences the names
                        longName = shortName;
                    }
                    if (shortName.equals("XX")) {
                        indexForUnknown = propertyIndex;
                    }
                    lineBreakPropertyValues.put(shortName, new Byte((byte)propertyIndex));
                    lineBreakPropertyShortNames.add(shortName);
                    lineBreakPropertyLongNames.add(longName);
                    propertyIndex++;
                    if (propertyIndex <= 0) {
                        // more than 127 properties do not fit into a (signed) byte
                        throw new Exception(
                            propertyValueFileName + ':' + lineNumber + ": property rolled over in " + line);
                    }
                }
                line = b.readLine();
                lineNumber++;
            }
        } finally {
            b.close();
        }
        if (indexForUnknown == 0) {
            throw new Exception("index for XX (unknown) line break property value not found");
        }

        // read property values
        Arrays.fill(lineBreakProperties, (byte)0);
        b = new BufferedReader(new InputStreamReader(new URL(lineBreakFileName).openStream()));
        try {
            String line = b.readLine();
            int lineNumber = 1;
            while (line != null) {
                // strip comments
                int idx = line.indexOf('#');
                if (idx >= 0) {
                    line = line.substring(0, idx);
                }
                line = line.trim();
                if (line.length() > 0) {
                    // data lines look like "<code point>[..<code point>];<short name>"
                    idx = line.indexOf(';');
                    if (idx <= 0) {
                        throw new Exception(
                            lineBreakFileName + ':' + lineNumber + ": No field delimiter in " + line);
                    }
                    Byte v = (Byte)lineBreakPropertyValues.get(line.substring(idx + 1).trim());
                    if (v == null) {
                        throw new Exception(
                            lineBreakFileName + ':' + lineNumber + ": Unknown property value in " + line);
                    }
                    String codepoint = line.substring(0, idx);
                    int low, high;
                    idx = codepoint.indexOf("..");
                    try {
                        if (idx >= 0) {
                            low = Integer.parseInt(codepoint.substring(0, idx), 16);
                            high = Integer.parseInt(codepoint.substring(idx + 2), 16);
                        } else {
                            low = Integer.parseInt(codepoint, 16);
                            high = low;
                        }
                    } catch (NumberFormatException e) {
                        throw new Exception(
                            lineBreakFileName + ':' + lineNumber + ": Invalid codepoint number in " + line);
                    }
                    if (low < 0 || high < 0) {
                        throw new Exception(
                            lineBreakFileName + ':' + lineNumber + ": Negative codepoint(s) in " + line);
                    }
                    // ignore non-baseplane characters for now; a range reaching beyond the
                    // base plane is cut off at U+FFFF
                    int last = Math.min(high, 0xFFFF);
                    byte vv = v.byteValue();
                    for (int i = low; i <= last; i++) {
                        if (lineBreakProperties[i] != 0) {
                            throw new Exception(
                                lineBreakFileName
                                    + ':'
                                    + lineNumber
                                    + ": Property already set for "
                                    + ((char)i)
                                    + " in "
                                    + line);
                        }
                        lineBreakProperties[i] = vv;
                    }
                }
                line = b.readLine();
                lineNumber++;
            }
        } finally {
            b.close();
        }
    }

    /**
     * Determine a good block size for the two stage optimized storage of the
     * line breaking properties. Prints one line per candidate size to System.out.
     * Note: the memory utilization calculation is a rule of thumb,
     * don't take it too seriously.
     * Not called by {@link #main(String[])}; kept as a developer aid.
     *
     * @param lineBreakFileName URL of line break property file.
     * @param propertyValueFileName URL of property values alias file.
     * @throws Exception in case anything goes wrong.
     */
    private static void optimizeBlocks(String lineBreakFileName, String propertyValueFileName)
        throws Exception {
        readLineBreakProperties(lineBreakFileName, propertyValueFileName);
        for (int i = 0; i < 16; i++) {
            int rowsize = 1 << i;
            int blocksize = lineBreakProperties.length / (rowsize);
            byte row[][] = new byte[rowsize][];
            int idx = 0;
            int nrOfDistinctBlocks = 0;
            for (int j = 0; j < rowsize; j++) {
                byte block[] = new byte[blocksize];
                for (int k = 0; k < blocksize; k++) {
                    block[k] = lineBreakProperties[idx];
                    idx++;
                }
                // has an identical block been seen before?
                boolean found = false;
                for (int k = 0; k < j; k++) {
                    if (row[k] != null) {
                        boolean matched = true;
                        for (int l = 0; l < blocksize; l++) {
                            if (row[k][l] != block[l]) {
                                matched = false;
                                break;
                            }
                        }
                        if (matched) {
                            found = true;
                            break;
                        }
                    }
                }
                if (!found) {
                    row[j] = block;
                    nrOfDistinctBlocks++;
                } else {
                    row[j] = null;
                }
            }
            // 4 bytes per block reference plus the distinct blocks themselves
            int size = rowsize * 4 + nrOfDistinctBlocks * blocksize;
            System.out.println(
                "i=" + i + " blocksize=" + blocksize + " blocks=" + nrOfDistinctBlocks + " size=" + size);
        }
    }

    /**
     * Command line entry point. Accepts the option/value pairs -l, -p, -b and -o; anything
     * else (or an option without a value) prints the usage together with the defaults.
     *
     * @param args command line arguments
     */
    public static void main(String[] args) {
        String lineBreakFileName = "http://www.unicode.org/Public/UNIDATA/LineBreak.txt";
        String propertyValueFileName = "http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt";
        String breakPairFileName = "src/codegen/unicode/data/LineBreakPairTable.txt";
        String outFileName = "LineBreakUtils.java";
        boolean ok = true;
        for (int i = 0; i < args.length; i = i + 2) {
            if (i + 1 == args.length) {
                // option without a value
                ok = false;
            } else {
                String opt = args[i];
                if ("-l".equals(opt)) {
                    lineBreakFileName = args[i + 1];
                } else if ("-p".equals(opt)) {
                    propertyValueFileName = args[i + 1];
                } else if ("-b".equals(opt)) {
                    breakPairFileName = args[i + 1];
                } else if ("-o".equals(opt)) {
                    outFileName = args[i + 1];
                } else {
                    ok = false;
                }
            }
        }
        if (!ok) {
            System.out.println("Usage: GenerateLineBreakUtils [-l <lineBreakFile>] [-p <propertyValueFile>] [-b <breakPairFile>] [-o <outputFile>]");
            System.out.println("  defaults:");
            System.out.println("    <lineBreakFile>:     " + lineBreakFileName);
            System.out.println("    <propertyValueFile>: " + propertyValueFileName);
            System.out.println("    <breakPairFile>:     " + breakPairFileName);
            System.out.println("    <outputFile>:        " + outFileName);
        } else {
            try {
                convertLineBreakProperties(
                    lineBreakFileName, propertyValueFileName, breakPairFileName, outFileName);
                System.out.println("Generated " + outFileName + " from");
                System.out.println("  <lineBreakFile>:     " + lineBreakFileName);
                System.out.println("  <propertyValueFile>: " + propertyValueFileName);
                System.out.println("  <breakPairFile>:     " + breakPairFileName);
            } catch (Exception e) {
                System.out.println("An unexpected error occured");
                e.printStackTrace();
            }
        }
    }
}