lib/jython/Lib/xml/parsers/xmlproc/charconv.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203

# Some experiments in adding character encoding conversions to xmlproc.
# This module is not yet used by the released xmlproc, since I'm awaiting
# a reorganization.
#
# $Id: charconv.py,v 1.5 2000/09/26 14:43:10 loewis Exp $

import string

# --- Conversion tables

# CP 850 to ISO 8859-1

# First element is no. 128, second 129 ...
# The non-ISO characters, such as <empty set>, are mapped to non-ISO chars
# 127-145 and 147-159 in the order they appear in CP 850. Since there are
# more non-ISO chars than there is room for in these intervals, some of
# the last chars are also mapped to 159.
    
cp850_iso=[199,252,233,226,228,224,229,231,234,235,232,239,238,236,196,197,
           201,230,198,244,246,242,251,249,255,246,220,248,163,127,215,128,
           225,237,243,250,241,209,170,186,191,174,172,189,188,161,171,187,
           129,130,131,132,133,193,194,192,169,134,135,136,137,162,165,138,
           139,140,141,142,143,144,227,195,145,147,148,149,150,151,152,164,
           240,208,202,203,200,153,205,206,207,154,155,156,157,166,204,158,
           211,223,212,210,245,213,181,222,254,218,219,217,253,221,175,180,
           173,177,159,190,182,167,247,184,176,168,159,185,179,178,159,160]

cp850_iso_tbl=""
for ix in range(128):
    cp850_iso_tbl=cp850_iso_tbl+chr(ix)
for chno in cp850_iso:
    cp850_iso_tbl=cp850_iso_tbl+chr(chno)

# ISO 8859-1 to CP 850
    
iso_cp850=[0]*256
for ix in range(256):
    iso_cp850[ord(cp850_iso_tbl[ix])]=ix

iso_cp850_tbl=""
for chno in iso_cp850:
    iso_cp850_tbl=iso_cp850_tbl+chr(chno)

# Windows CP 1252 to ISO 8859-1
# Maps characters 128-159, 63 means non-mappable, 127 means unused in 1252
# Does a fuzzy transform (ndash and mdash both mapped to -, and so on)

cp1252_iso=[127,127,44,63,63,95,63,63,94,63,63,60,198,127,127,127,127,39,39,
            34,34,183,45,45,126,63,63,62,230,127,127,127]

cp1252_iso_tbl=""
for char in map(chr,range(128)+cp1252_iso+range(160,256)):
    cp1252_iso_tbl=cp1252_iso_tbl+char

# --- Conversion functions

def utf8_to_iso8859(data):
    out=""

    ix=0
    for ix in range(len(data)):
        chn=ord(data[ix])
        if chn & 224==192: # 110xxxxx
            out=out+chr( ((chn & 3) << 6) + (ord(data[ix+1]) & 63))
        elif chn & 128==0: # 0xxxxxxx
            out=out+data[ix]

    return out

def iso8859_to_utf8(data):
    out=""

    for ch in data:
        if ord(ch)<128:
            out=out+ch
        else:
            chno=ord(ch)
            out=out+chr(192+((chno & 192)>>6))+chr(128+(chno & 63))

    return out

def cp850_to_iso8859(data):
    return string.translate(data,cp850_iso_tbl)

def iso8859_to_cp850(data):
    return string.translate(data,iso_cp850_tbl)

def id_conv(data):
    return data

def cp850_to_utf8(data):
    return iso8859_to_utf8(cp850_to_iso8859(data))

def utf8_to_cp850(data):
    return iso8859_to_cp850(utf8_to_iso8859(data))

def cp1252_to_iso8859(data):
    return string.translate(data,cp1252_iso_tbl)

# --- Conversion function database

class ConverterDatabase:
    """This class knows about all registered converting functions, and can be
    queried for information about converters."""

    def __init__(self):
        self.__map={}
        self.__alias_map={}

    def add_alias(self,canonical,alias):
        "Adds an alias for a character set."
        self.__alias_map[string.lower(alias)]=string.lower(canonical)
        
    def can_convert(self,from_encoding,to_encoding):
        """Returns true if converters to from from_encoding to to_encoding are
        known. Encoding names follow the syntax specified by the XML rec."""
        from_encoding=self._canonize_name(from_encoding)
        to_encoding=self._canonize_name(to_encoding)

        if from_encoding==to_encoding:
            return 1
        
        try:
            return self.__map[from_encoding].has_key(to_encoding)
        except KeyError:
            return 0

    def get_converter(self,from_encoding,to_encoding):
        """Returns a converter function that converts from the character
        encoding from_encoding to to_encoding. A KeyError will be thrown
        if no converter is known."""
        from_encoding=self._canonize_name(from_encoding)
        to_encoding=self._canonize_name(to_encoding)

        if from_encoding==to_encoding:
            return id_conv
        else:
            return self.__map[from_encoding][to_encoding]

    def add_converter(self,from_encoding,to_encoding,converter):
        from_encoding=self._canonize_name(from_encoding)
        to_encoding=self._canonize_name(to_encoding)
        
        if not self.__map.has_key(from_encoding):
            self.__map[from_encoding]={}

        self.__map[from_encoding][to_encoding]=converter

    def _canonize_name(self,name):
        "Returns the canonical form of a charset name."
        name=string.lower(name)
        if self.__alias_map.has_key(name):
            return self.__alias_map[name]
        else:
            return name
        
# --- Globals

convdb=ConverterDatabase()
convdb.add_alias("US-ASCII","ANSI_X3.4-1968")
convdb.add_alias("US-ASCII","iso-ir-6")
convdb.add_alias("US-ASCII","ANSI_X3.4-1986")
convdb.add_alias("US-ASCII","ISO_646.irv:1991")
convdb.add_alias("US-ASCII","ASCII")
convdb.add_alias("US-ASCII","ISO646-US")
convdb.add_alias("US-ASCII","us")
convdb.add_alias("US-ASCII","IBM367")
convdb.add_alias("US-ASCII","cp367")
convdb.add_alias("US-ASCII","csASCII")

convdb.add_alias("ISO-8859-1","ISO_8859-1:1987")
convdb.add_alias("ISO-8859-1","iso-ir-100")
convdb.add_alias("ISO-8859-1","ISO_8859-1")
convdb.add_alias("ISO-8859-1","latin1")
convdb.add_alias("ISO-8859-1","l1")
convdb.add_alias("ISO-8859-1","IBM819")
convdb.add_alias("ISO-8859-1","CP819")
convdb.add_alias("ISO-8859-1","csISOLatin1")

convdb.add_alias("IBM850","cp850")
convdb.add_alias("IBM850","850")
convdb.add_alias("IBM850","csPC850Multilingual")

# converters (foo -> foo case not needed, handled automatically)

convdb.add_converter("IBM850","ISO-8859-1",cp850_to_iso8859)
convdb.add_converter("US-ASCII","ISO-8859-1",id_conv)
convdb.add_converter("windows-1252","ISO-8859-1",cp1252_to_iso8859)

convdb.add_converter("ISO-8859-1","IBM850",iso8859_to_cp850)
convdb.add_converter("US-ASCII","IBM850",id_conv)

convdb.add_converter("ISO-8859-1","WINDOWS-1252",id_conv)

convdb.add_converter("US-ASCII","UTF-8",id_conv)

# UTF-8 stuff disabled due to total lack of speed

# convdb.add_converter("UTF-8","ISO-8859-1",utf8_to_iso8859)
# convdb.add_converter("ISO-8859-1","UTF-8",iso8859_to_utf8)
# convdb.add_converter("UTF-8","IBM850",utf8_to_cp850)
# convdb.add_converter("IBM850","UTF-8",cp850_to_utf8)