From 8ec8f0c0c6c68d9b13c3bc3416c3234eddd48379 Mon Sep 17 00:00:00 2001
From: jhugunin
Date: Fri, 3 Jan 2003 23:19:47 +0000
Subject: making jython-2.1 available for scripting

---
 lib/jython/Lib/xmllib.py | 929 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 929 insertions(+)
 create mode 100644 lib/jython/Lib/xmllib.py

diff --git a/lib/jython/Lib/xmllib.py b/lib/jython/Lib/xmllib.py
new file mode 100644
index 000000000..b5e58560c
--- /dev/null
+++ b/lib/jython/Lib/xmllib.py
@@ -0,0 +1,929 @@
+"""A parser for XML, using the derived class as static DTD."""
+
+# Author: Sjoerd Mullender.
+
+import re
+import string
+
+
+version = '0.3'
+
+class Error(RuntimeError):
+    pass
+
+# Regular expressions used for parsing
+
+_S = '[ \t\r\n]+'                       # white space
+_opS = '[ \t\r\n]*'                     # optional white space
+_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # valid XML name
+_QStr = "(?:'[^']*'|\"[^\"]*\")"        # quoted XML string
+illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
+interesting = re.compile('[]&<]')
+
+amp = re.compile('&')
+ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
+entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
+charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
+space = re.compile(_S + '$')
+newline = re.compile('\n')
+
+attrfind = re.compile(
+    _S + '(?P<name>' + _Name + ')'
+    '(' + _opS + '=' + _opS +
+    '(?P<value>'+_QStr+'|[-a-zA-Z0-9.:+*%?!\(\)_#=~]+))?')
+starttagopen = re.compile('<' + _Name)
+starttagend = re.compile(_opS + '(?P<slash>/?)>')
+starttagmatch = re.compile('<(?P<tagname>'+_Name+')'
+                      '(?P<attrs>(?:'+attrfind.pattern+')*)'+
+                      starttagend.pattern)
+endtagopen = re.compile('</')
+endbracket = re.compile(_opS + '>')
+endbracketfind = re.compile('(?:[^>\'"]|'+_QStr+')*>')
+tagfind = re.compile(_Name)
+cdataopen = re.compile(r'<!\[CDATA\[')
+cdataclose = re.compile(r'\]\]>')
+# this matches one of the following:
+# SYSTEM SystemLiteral
+# PUBLIC PubidLiteral SystemLiteral
+_SystemLiteral = '(?P<%s>'+_QStr+')'
+_PublicLiteral = '(?P<%s>"[-\'\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
+                        "'[-\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
+_ExternalId = '(?:SYSTEM|' \
+                 'PUBLIC'+_S+_PublicLiteral%'pubid'+ \
+              ')'+_S+_SystemLiteral%'syslit'
+doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')'
+                     '(?:'+_S+_ExternalId+')?'+_opS)
+xmldecl = re.compile('<\?xml'+_S+
+                     'version'+_opS+'='+_opS+'(?P<version>'+_QStr+')'+
+                     '(?:'+_S+'encoding'+_opS+'='+_opS+
+                        "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
+                        '"[A-Za-z][-A-Za-z0-9._]*"))?'
+                     '(?:'+_S+'standalone'+_opS+'='+_opS+
+                        '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
+                     _opS+'\?>')
+procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
+procclose = re.compile(_opS + r'\?>')
+commentopen = re.compile('<!--')
+commentclose = re.compile(r'-->')
+doubledash = re.compile('--')
+attrtrans = string.maketrans(' \r\n\t', '    ')
+
+# definitions for XML namespaces
+_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*'    # XML Name, minus the ":"
+ncname = re.compile(_NCName + '$')
+qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?' # optional prefix
+                   '(?P<local>' + _NCName + ')$')
+
+xmlns = re.compile('xmlns(?::(?P<ncname>'+_NCName+'))?$')
+
+# XML parser base class -- find tags and call handler functions.
+# Usage: p = XMLParser(); p.feed(data); ...; p.close().
+# The dtd is defined by deriving a class which defines methods with
+# special names to handle tags: start_foo and end_foo to handle <foo>
+# and </foo>, respectively.  The data between tags is passed to the
+# parser by calling self.handle_data() with some data as argument (the
+# data may be split up in arbitrary chunks).
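+#
+# A minimal usage sketch (illustrative only): the element name "item" and
+# the collecting "items" list below are invented for the example; the
+# start_foo/end_foo dispatch, handle_data(), feed() and close() are the
+# hooks described above.
+#
+#     import xmllib
+#
+#     class ItemParser(xmllib.XMLParser):
+#         def __init__(self):
+#             xmllib.XMLParser.__init__(self)
+#             self.items = []               # text of each <item> element
+#             self.__text = None
+#         def start_item(self, attrs):      # called for an <item ...> start tag
+#             self.__text = ''
+#         def handle_data(self, data):      # character data between tags
+#             if self.__text is not None:
+#                 self.__text = self.__text + data
+#         def end_item(self):               # called for the matching </item>
+#             self.items.append(self.__text)
+#             self.__text = None
+#
+#     p = ItemParser()
+#     p.feed('<doc><item>one</item><item>two</item></doc>')
+#     p.close()
+#     print p.items                         # prints ['one', 'two']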
+ +class XMLParser: + attributes = {} # default, to be overridden + elements = {} # default, to be overridden + + # parsing options, settable using keyword args in __init__ + __accept_unquoted_attributes = 0 + __accept_missing_endtag_name = 0 + __map_case = 0 + __accept_utf8 = 0 + __translate_attribute_references = 1 + + # Interface -- initialize and reset this instance + def __init__(self, **kw): + self.__fixed = 0 + if kw.has_key('accept_unquoted_attributes'): + self.__accept_unquoted_attributes = kw['accept_unquoted_attributes'] + if kw.has_key('accept_missing_endtag_name'): + self.__accept_missing_endtag_name = kw['accept_missing_endtag_name'] + if kw.has_key('map_case'): + self.__map_case = kw['map_case'] + if kw.has_key('accept_utf8'): + self.__accept_utf8 = kw['accept_utf8'] + if kw.has_key('translate_attribute_references'): + self.__translate_attribute_references = kw['translate_attribute_references'] + self.reset() + + def __fixelements(self): + self.__fixed = 1 + self.elements = {} + self.__fixdict(self.__dict__) + self.__fixclass(self.__class__) + + def __fixclass(self, kl): + self.__fixdict(kl.__dict__) + for k in kl.__bases__: + self.__fixclass(k) + + def __fixdict(self, dict): + for key in dict.keys(): + if key[:6] == 'start_': + tag = key[6:] + start, end = self.elements.get(tag, (None, None)) + if start is None: + self.elements[tag] = getattr(self, key), end + elif key[:4] == 'end_': + tag = key[4:] + start, end = self.elements.get(tag, (None, None)) + if end is None: + self.elements[tag] = start, getattr(self, key) + + # Interface -- reset this instance. Loses all unprocessed data + def reset(self): + self.rawdata = '' + self.stack = [] + self.nomoretags = 0 + self.literal = 0 + self.lineno = 1 + self.__at_start = 1 + self.__seen_doctype = None + self.__seen_starttag = 0 + self.__use_namespaces = 0 + self.__namespaces = {'xml':None} # xml is implicitly declared + # backward compatibility hack: if elements not overridden, + # fill it in ourselves + if self.elements is XMLParser.elements: + self.__fixelements() + + # For derived classes only -- enter literal mode (CDATA) till EOF + def setnomoretags(self): + self.nomoretags = self.literal = 1 + + # For derived classes only -- enter literal mode (CDATA) + def setliteral(self, *args): + self.literal = 1 + + # Interface -- feed some data to the parser. Call this as + # often as you want, with as little or as much text as you + # want (may include '\n'). (This just saves the text, all the + # processing is done by goahead().) 
+ def feed(self, data): + self.rawdata = self.rawdata + data + self.goahead(0) + + # Interface -- handle the remaining data + def close(self): + self.goahead(1) + if self.__fixed: + self.__fixed = 0 + # remove self.elements so that we don't leak + del self.elements + + # Interface -- translate references + def translate_references(self, data, all = 1): + if not self.__translate_attribute_references: + return data + i = 0 + while 1: + res = amp.search(data, i) + if res is None: + return data + s = res.start(0) + res = ref.match(data, s) + if res is None: + self.syntax_error("bogus `&'") + i = s+1 + continue + i = res.end(0) + str = res.group(1) + rescan = 0 + if str[0] == '#': + if str[1] == 'x': + str = chr(int(str[2:], 16)) + else: + str = chr(int(str[1:])) + if data[i - 1] != ';': + self.syntax_error("`;' missing after char reference") + i = i-1 + elif all: + if self.entitydefs.has_key(str): + str = self.entitydefs[str] + rescan = 1 + elif data[i - 1] != ';': + self.syntax_error("bogus `&'") + i = s + 1 # just past the & + continue + else: + self.syntax_error("reference to unknown entity `&%s;'" % str) + str = '&' + str + ';' + elif data[i - 1] != ';': + self.syntax_error("bogus `&'") + i = s + 1 # just past the & + continue + + # when we get here, str contains the translated text and i points + # to the end of the string that is to be replaced + data = data[:s] + str + data[i:] + if rescan: + i = s + else: + i = s + len(str) + + # Interface - return a dictionary of all namespaces currently valid + def getnamespace(self): + nsdict = {} + for t, d, nst in self.stack: + nsdict.update(d) + return nsdict + + # Internal -- handle data as far as reasonable. May leave state + # and data to be processed by a subsequent call. If 'end' is + # true, force handling all data as if followed by EOF marker. 
+    def goahead(self, end):
+        rawdata = self.rawdata
+        i = 0
+        n = len(rawdata)
+        while i < n:
+            if i > 0:
+                self.__at_start = 0
+            if self.nomoretags:
+                data = rawdata[i:n]
+                self.handle_data(data)
+                self.lineno = self.lineno + data.count('\n')
+                i = n
+                break
+            res = interesting.search(rawdata, i)
+            if res:
+                j = res.start(0)
+            else:
+                j = n
+            if i < j:
+                data = rawdata[i:j]
+                if self.__at_start and space.match(data) is None:
+                    self.syntax_error('illegal data at start of file')
+                self.__at_start = 0
+                if not self.stack and space.match(data) is None:
+                    self.syntax_error('data not in content')
+                if not self.__accept_utf8 and illegal.search(data):
+                    self.syntax_error('illegal character in content')
+                self.handle_data(data)
+                self.lineno = self.lineno + data.count('\n')
+            i = j
+            if i == n: break
+            if rawdata[i] == '<':
+                if starttagopen.match(rawdata, i):
+                    if self.literal:
+                        data = rawdata[i]
+                        self.handle_data(data)
+                        self.lineno = self.lineno + data.count('\n')
+                        i = i+1
+                        continue
+                    k = self.parse_starttag(i)
+                    if k < 0: break
+                    self.__seen_starttag = 1
+                    self.lineno = self.lineno + rawdata[i:k].count('\n')
+                    i = k
+                    continue
+                if endtagopen.match(rawdata, i):
+                    k = self.parse_endtag(i)
+                    if k < 0: break
+                    self.lineno = self.lineno + rawdata[i:k].count('\n')
+                    i = k
+                    continue
+                if commentopen.match(rawdata, i):
+                    if self.literal:
+                        data = rawdata[i]
+                        self.handle_data(data)
+                        self.lineno = self.lineno + data.count('\n')
+                        i = i+1
+                        continue
+                    k = self.parse_comment(i)
+                    if k < 0: break
+                    self.lineno = self.lineno + rawdata[i:k].count('\n')
+                    i = k
+                    continue
+                if cdataopen.match(rawdata, i):
+                    k = self.parse_cdata(i)
+                    if k < 0: break
+                    self.lineno = self.lineno + rawdata[i:k].count('\n')
+                    i = k
+                    continue
+                res = xmldecl.match(rawdata, i)
+                if res:
+                    if not self.__at_start:
+                        self.syntax_error("<?xml?> declaration not at start of document")
+                    version, encoding, standalone = res.group('version',
+                                                              'encoding',
+                                                              'standalone')
+                    if version[1:-1] != '1.0':
+                        raise Error('only XML version 1.0 supported')
+                    if encoding: encoding = encoding[1:-1]
+                    if standalone: standalone = standalone[1:-1]
+                    self.handle_xml(encoding, standalone)
+                    i = res.end(0)
+                    continue
+                res = procopen.match(rawdata, i)
+                if res:
+                    k = self.parse_proc(i)
+                    if k < 0: break
+                    self.lineno = self.lineno + rawdata[i:k].count('\n')
+                    i = k
+                    continue
+                res = doctype.match(rawdata, i)
+                if res:
+                    if self.literal:
+                        data = rawdata[i]
+                        self.handle_data(data)
+                        self.lineno = self.lineno + data.count('\n')
+                        i = i+1
+                        continue
+                    if self.__seen_doctype:
+                        self.syntax_error('multiple DOCTYPE elements')
+                    if self.__seen_starttag:
+                        self.syntax_error('DOCTYPE not at beginning of document')
+                    k = self.parse_doctype(res)
+                    if k < 0: break
+                    self.__seen_doctype = res.group('name')
+                    if self.__map_case:
+                        self.__seen_doctype = self.__seen_doctype.lower()
+                    self.lineno = self.lineno + rawdata[i:k].count('\n')
+                    i = k
+                    continue
+            elif rawdata[i] == '&':
+                if self.literal:
+                    data = rawdata[i]
+                    self.handle_data(data)
+                    i = i+1
+                    continue
+                res = charref.match(rawdata, i)
+                if res is not None:
+                    i = res.end(0)
+                    if rawdata[i-1] != ';':
+                        self.syntax_error("`;' missing in charref")
+                        i = i-1
+                    if not self.stack:
+                        self.syntax_error('data not in content')
+                    self.handle_charref(res.group('char')[:-1])
+                    self.lineno = self.lineno + res.group(0).count('\n')
+                    continue
+                res = entityref.match(rawdata, i)
+                if res is not None:
+                    i = res.end(0)
+                    if rawdata[i-1] != ';':
+                        self.syntax_error("`;' missing in entityref")
+                        i = i-1
+                    name = res.group('name')
+
if self.__map_case: + name = name.lower() + if self.entitydefs.has_key(name): + self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:] + n = len(rawdata) + i = res.start(0) + else: + self.unknown_entityref(name) + self.lineno = self.lineno + res.group(0).count('\n') + continue + elif rawdata[i] == ']': + if self.literal: + data = rawdata[i] + self.handle_data(data) + i = i+1 + continue + if n-i < 3: + break + if cdataclose.match(rawdata, i): + self.syntax_error("bogus `]]>'") + self.handle_data(rawdata[i]) + i = i+1 + continue + else: + raise Error('neither < nor & ??') + # We get here only if incomplete matches but + # nothing else + break + # end while + if i > 0: + self.__at_start = 0 + if end and i < n: + data = rawdata[i] + self.syntax_error("bogus `%s'" % data) + if not self.__accept_utf8 and illegal.search(data): + self.syntax_error('illegal character in content') + self.handle_data(data) + self.lineno = self.lineno + data.count('\n') + self.rawdata = rawdata[i+1:] + return self.goahead(end) + self.rawdata = rawdata[i:] + if end: + if not self.__seen_starttag: + self.syntax_error('no elements in file') + if self.stack: + self.syntax_error('missing end tags') + while self.stack: + self.finish_endtag(self.stack[-1][0]) + + # Internal -- parse comment, return length or -1 if not terminated + def parse_comment(self, i): + rawdata = self.rawdata + if rawdata[i:i+4] != '