You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

saxutils.py 20KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661
  1. """
  2. A library of useful helper classes to the saxlib classes, for the
  3. convenience of application and driver writers.
  4. $Id: saxutils.py,v 1.19 2001/03/20 07:19:46 loewis Exp $
  5. """
  6. import types, sys, urllib, urlparse, os, string
  7. import handler, _exceptions, xmlreader
  8. try:
  9. _StringTypes = [types.StringType, types.UnicodeType]
  10. except AttributeError: # 1.5 compatibility:UnicodeType not defined
  11. _StringTypes = [types.StringType]
  12. def escape(data, entities={}):
  13. """Escape &, <, and > in a string of data.
  14. You can escape other strings of data by passing a dictionary as
  15. the optional entities parameter. The keys and values must all be
  16. strings; each key will be replaced with its corresponding value.
  17. """
  18. data = string.replace(data, "&", "&amp;")
  19. data = string.replace(data, "<", "&lt;")
  20. data = string.replace(data, ">", "&gt;")
  21. for chars, entity in entities.items():
  22. data = string.replace(data, chars, entity)
  23. return data
  24. # --- DefaultHandler
  25. class DefaultHandler(handler.EntityResolver, handler.DTDHandler,
  26. handler.ContentHandler, handler.ErrorHandler):
  27. """Default base class for SAX2 event handlers. Implements empty
  28. methods for all callback methods, which can be overridden by
  29. application implementors. Replaces the deprecated SAX1 HandlerBase
  30. class."""
  31. # --- Location
  32. class Location:
  33. """Represents a location in an XML entity. Initialized by being passed
  34. a locator, from which it reads off the current location, which is then
  35. stored internally."""
  36. def __init__(self, locator):
  37. self.__col = locator.getColumnNumber()
  38. self.__line = locator.getLineNumber()
  39. self.__pubid = locator.getPublicId()
  40. self.__sysid = locator.getSystemId()
  41. def getColumnNumber(self):
  42. return self.__col
  43. def getLineNumber(self):
  44. return self.__line
  45. def getPublicId(self):
  46. return self.__pubid
  47. def getSystemId(self):
  48. return self.__sysid
  49. # --- ErrorPrinter
  50. class ErrorPrinter:
  51. "A simple class that just prints error messages to standard out."
  52. def __init__(self, level=0, outfile=sys.stderr):
  53. self._level = level
  54. self._outfile = outfile
  55. def warning(self, exception):
  56. if self._level <= 0:
  57. self._outfile.write("WARNING in %s: %s\n" %
  58. (self.__getpos(exception),
  59. exception.getMessage()))
  60. def error(self, exception):
  61. if self._level <= 1:
  62. self._outfile.write("ERROR in %s: %s\n" %
  63. (self.__getpos(exception),
  64. exception.getMessage()))
  65. def fatalError(self, exception):
  66. if self._level <= 2:
  67. self._outfile.write("FATAL ERROR in %s: %s\n" %
  68. (self.__getpos(exception),
  69. exception.getMessage()))
  70. def __getpos(self, exception):
  71. if isinstance(exception, _exceptions.SAXParseException):
  72. return "%s:%s:%s" % (exception.getSystemId(),
  73. exception.getLineNumber(),
  74. exception.getColumnNumber())
  75. else:
  76. return "<unknown>"
  77. # --- ErrorRaiser
  78. class ErrorRaiser:
  79. "A simple class that just raises the exceptions it is passed."
  80. def __init__(self, level = 0):
  81. self._level = level
  82. def error(self, exception):
  83. if self._level <= 1:
  84. raise exception
  85. def fatalError(self, exception):
  86. if self._level <= 2:
  87. raise exception
  88. def warning(self, exception):
  89. if self._level <= 0:
  90. raise exception
  91. # --- AttributesImpl now lives in xmlreader
  92. from xmlreader import AttributesImpl
  93. # --- XMLGenerator is the SAX2 ContentHandler for writing back XML
  94. try:
  95. import codecs
  96. def _outputwrapper(stream,encoding):
  97. writerclass = codecs.lookup(encoding)[3]
  98. return writerclass(stream)
  99. except ImportError: # 1.5 compatibility: fall back to do-nothing
  100. def _outputwrapper(stream,encoding):
  101. return stream
  102. class XMLGenerator(handler.ContentHandler):
  103. def __init__(self, out=None, encoding="iso-8859-1"):
  104. if out is None:
  105. import sys
  106. out = sys.stdout
  107. handler.ContentHandler.__init__(self)
  108. self._out = _outputwrapper(out,encoding)
  109. self._ns_contexts = [{}] # contains uri -> prefix dicts
  110. self._current_context = self._ns_contexts[-1]
  111. self._undeclared_ns_maps = []
  112. self._encoding = encoding
  113. # ContentHandler methods
  114. def startDocument(self):
  115. self._out.write('<?xml version="1.0" encoding="%s"?>\n' %
  116. self._encoding)
  117. def startPrefixMapping(self, prefix, uri):
  118. self._ns_contexts.append(self._current_context.copy())
  119. self._current_context[uri] = prefix
  120. self._undeclared_ns_maps.append((prefix, uri))
  121. def endPrefixMapping(self, prefix):
  122. self._current_context = self._ns_contexts[-1]
  123. del self._ns_contexts[-1]
  124. def startElement(self, name, attrs):
  125. self._out.write('<' + name)
  126. for (name, value) in attrs.items():
  127. self._out.write(' %s="%s"' % (name, escape(value)))
  128. self._out.write('>')
  129. def endElement(self, name):
  130. self._out.write('</%s>' % name)
  131. def startElementNS(self, name, qname, attrs):
  132. if name[0] is None:
  133. name = name[1]
  134. elif self._current_context[name[0]] is None:
  135. # default namespace
  136. name = name[1]
  137. else:
  138. name = self._current_context[name[0]] + ":" + name[1]
  139. self._out.write('<' + name)
  140. for k,v in self._undeclared_ns_maps:
  141. if k is None:
  142. self._out.write(' xmlns="%s"' % v)
  143. else:
  144. self._out.write(' xmlns:%s="%s"' % (k,v))
  145. self._undeclared_ns_maps = []
  146. for (name, value) in attrs.items():
  147. name = self._current_context[name[0]] + ":" + name[1]
  148. self._out.write(' %s="%s"' % (name, escape(value)))
  149. self._out.write('>')
  150. def endElementNS(self, name, qname):
  151. # XXX: if qname is not None, we better use it.
  152. # Python 2.0b2 requires us to use the recorded prefix for
  153. # name[0], though
  154. if name[0] is None:
  155. qname = name[1]
  156. elif self._current_context[name[0]] is None:
  157. qname = name[1]
  158. else:
  159. qname = self._current_context[name[0]] + ":" + name[1]
  160. self._out.write('</%s>' % qname)
  161. def characters(self, content):
  162. self._out.write(escape(content))
  163. def ignorableWhitespace(self, content):
  164. self._out.write(content)
  165. def processingInstruction(self, target, data):
  166. self._out.write('<?%s %s?>' % (target, data))
  167. # --- ContentGenerator is the SAX1 DocumentHandler for writing back XML
  168. class ContentGenerator(XMLGenerator):
  169. def characters(self, str, start, end):
  170. # In SAX1, characters receives start and end; in SAX2, it receives
  171. # a string. For plain strings, we may want to use a buffer object.
  172. return XMLGenerator.characters(self, str[start:start+end])
  173. # --- XMLFilterImpl
  174. class XMLFilterBase(xmlreader.XMLReader):
  175. """This class is designed to sit between an XMLReader and the
  176. client application's event handlers. By default, it does nothing
  177. but pass requests up to the reader and events on to the handlers
  178. unmodified, but subclasses can override specific methods to modify
  179. the event stream or the configuration requests as they pass
  180. through."""
  181. # ErrorHandler methods
  182. def error(self, exception):
  183. self._err_handler.error(exception)
  184. def fatalError(self, exception):
  185. self._err_handler.fatalError(exception)
  186. def warning(self, exception):
  187. self._err_handler.warning(exception)
  188. # ContentHandler methods
  189. def setDocumentLocator(self, locator):
  190. self._cont_handler.setDocumentLocator(locator)
  191. def startDocument(self):
  192. self._cont_handler.startDocument()
  193. def endDocument(self):
  194. self._cont_handler.endDocument()
  195. def startPrefixMapping(self, prefix, uri):
  196. self._cont_handler.startPrefixMapping(prefix, uri)
  197. def endPrefixMapping(self, prefix):
  198. self._cont_handler.endPrefixMapping(prefix)
  199. def startElement(self, name, attrs):
  200. self._cont_handler.startElement(name, attrs)
  201. def endElement(self, name):
  202. self._cont_handler.endElement(name)
  203. def startElementNS(self, name, qname, attrs):
  204. self._cont_handler.startElementNS(name, qname, attrs)
  205. def endElementNS(self, name, qname):
  206. self._cont_handler.endElementNS(name, qname)
  207. def characters(self, content):
  208. self._cont_handler.characters(content)
  209. def ignorableWhitespace(self, chars):
  210. self._cont_handler.ignorableWhitespace(chars)
  211. def processingInstruction(self, target, data):
  212. self._cont_handler.processingInstruction(target, data)
  213. def skippedEntity(self, name):
  214. self._cont_handler.skippedEntity(name)
  215. # DTDHandler methods
  216. def notationDecl(self, name, publicId, systemId):
  217. self._dtd_handler.notationDecl(name, publicId, systemId)
  218. def unparsedEntityDecl(self, name, publicId, systemId, ndata):
  219. self._dtd_handler.unparsedEntityDecl(name, publicId, systemId, ndata)
  220. # EntityResolver methods
  221. def resolveEntity(self, publicId, systemId):
  222. self._ent_handler.resolveEntity(publicId, systemId)
  223. # XMLReader methods
  224. def parse(self, source):
  225. self._parent.setContentHandler(self)
  226. self._parent.setErrorHandler(self)
  227. self._parent.setEntityResolver(self)
  228. self._parent.setDTDHandler(self)
  229. self._parent.parse(source)
  230. def setLocale(self, locale):
  231. self._parent.setLocale(locale)
  232. def getFeature(self, name):
  233. return self._parent.getFeature(name)
  234. def setFeature(self, name, state):
  235. self._parent.setFeature(name, state)
  236. def getProperty(self, name):
  237. return self._parent.getProperty(name)
  238. def setProperty(self, name, value):
  239. self._parent.setProperty(name, value)
  240. # FIXME: remove this backward compatibility hack when not needed anymore
  241. XMLFilterImpl = XMLFilterBase
  242. # --- BaseIncrementalParser
  243. class BaseIncrementalParser(xmlreader.IncrementalParser):
  244. """This class implements the parse method of the XMLReader
  245. interface using the feed, close and reset methods of the
  246. IncrementalParser interface as a convenience to SAX 2.0 driver
  247. writers."""
  248. def parse(self, source):
  249. source = prepare_input_source(source)
  250. self.prepareParser(source)
  251. self._cont_handler.startDocument()
  252. # FIXME: what about char-stream?
  253. inf = source.getByteStream()
  254. buffer = inf.read(16384)
  255. while buffer != "":
  256. self.feed(buffer)
  257. buffer = inf.read(16384)
  258. self.close()
  259. self.reset()
  260. self._cont_handler.endDocument()
  261. def prepareParser(self, source):
  262. """This method is called by the parse implementation to allow
  263. the SAX 2.0 driver to prepare itself for parsing."""
  264. raise NotImplementedError("prepareParser must be overridden!")
  265. # --- Utility functions
  266. def prepare_input_source(source, base = ""):
  267. """This function takes an InputSource and an optional base URL and
  268. returns a fully resolved InputSource object ready for reading."""
  269. if type(source) in _StringTypes:
  270. source = xmlreader.InputSource(source)
  271. elif hasattr(source, "read"):
  272. f = source
  273. source = xmlreader.InputSource()
  274. source.setByteStream(f)
  275. if hasattr(f, "name"):
  276. source.setSystemId(f.name)
  277. if source.getByteStream() is None:
  278. sysid = source.getSystemId()
  279. if os.path.isfile(sysid):
  280. basehead = os.path.split(os.path.normpath(base))[0]
  281. source.setSystemId(os.path.join(basehead, sysid))
  282. f = open(sysid, "rb")
  283. else:
  284. source.setSystemId(urlparse.urljoin(base, sysid))
  285. f = urllib.urlopen(source.getSystemId())
  286. source.setByteStream(f)
  287. return source
  288. # ===========================================================================
  289. #
  290. # DEPRECATED SAX 1.0 CLASSES
  291. #
  292. # ===========================================================================
  293. # --- AttributeMap
  294. class AttributeMap:
  295. """An implementation of AttributeList that takes an (attr,val) hash
  296. and uses it to implement the AttributeList interface."""
  297. def __init__(self, map):
  298. self.map=map
  299. def getLength(self):
  300. return len(self.map.keys())
  301. def getName(self, i):
  302. try:
  303. return self.map.keys()[i]
  304. except IndexError,e:
  305. return None
  306. def getType(self, i):
  307. return "CDATA"
  308. def getValue(self, i):
  309. try:
  310. if type(i)==types.IntType:
  311. return self.map[self.getName(i)]
  312. else:
  313. return self.map[i]
  314. except KeyError,e:
  315. return None
  316. def __len__(self):
  317. return len(self.map)
  318. def __getitem__(self, key):
  319. if type(key)==types.IntType:
  320. return self.map.keys()[key]
  321. else:
  322. return self.map[key]
  323. def items(self):
  324. return self.map.items()
  325. def keys(self):
  326. return self.map.keys()
  327. def has_key(self,key):
  328. return self.map.has_key(key)
  329. def get(self, key, alternative=None):
  330. return self.map.get(key, alternative)
  331. def copy(self):
  332. return AttributeMap(self.map.copy())
  333. def values(self):
  334. return self.map.values()
  335. # --- Event broadcasting object
  336. class EventBroadcaster:
  337. """Takes a list of objects and forwards any method calls received
  338. to all objects in the list. The attribute list holds the list and
  339. can freely be modified by clients."""
  340. class Event:
  341. "Helper objects that represent event methods."
  342. def __init__(self,list,name):
  343. self.list=list
  344. self.name=name
  345. def __call__(self,*rest):
  346. for obj in self.list:
  347. apply(getattr(obj,self.name), rest)
  348. def __init__(self,list):
  349. self.list=list
  350. def __getattr__(self,name):
  351. return self.Event(self.list,name)
  352. def __repr__(self):
  353. return "<EventBroadcaster instance at %d>" % id(self)
  354. # --- ESIS document handler
  355. import saxlib
  356. class ESISDocHandler(saxlib.HandlerBase):
  357. "A SAX document handler that produces naive ESIS output."
  358. def __init__(self,writer=sys.stdout):
  359. self.writer=writer
  360. def processingInstruction (self,target, remainder):
  361. """Receive an event signalling that a processing instruction
  362. has been found."""
  363. self.writer.write("?"+target+" "+remainder+"\n")
  364. def startElement(self,name,amap):
  365. "Receive an event signalling the start of an element."
  366. self.writer.write("("+name+"\n")
  367. for a_name in amap.keys():
  368. self.writer.write("A"+a_name+" "+amap[a_name]+"\n")
  369. def endElement(self,name):
  370. "Receive an event signalling the end of an element."
  371. self.writer.write(")"+name+"\n")
  372. def characters(self,data,start_ix,length):
  373. "Receive an event signalling that character data has been found."
  374. self.writer.write("-"+data[start_ix:start_ix+length]+"\n")
  375. # --- XML canonizer
  376. class Canonizer(saxlib.HandlerBase):
  377. "A SAX document handler that produces canonized XML output."
  378. def __init__(self,writer=sys.stdout):
  379. self.elem_level=0
  380. self.writer=writer
  381. def processingInstruction (self,target, remainder):
  382. if not target=="xml":
  383. self.writer.write("<?"+target+" "+remainder+"?>")
  384. def startElement(self,name,amap):
  385. self.writer.write("<"+name)
  386. a_names=amap.keys()
  387. a_names.sort()
  388. for a_name in a_names:
  389. self.writer.write(" "+a_name+"=\"")
  390. self.write_data(amap[a_name])
  391. self.writer.write("\"")
  392. self.writer.write(">")
  393. self.elem_level=self.elem_level+1
  394. def endElement(self,name):
  395. self.writer.write("</"+name+">")
  396. self.elem_level=self.elem_level-1
  397. def ignorableWhitespace(self,data,start_ix,length):
  398. self.characters(data,start_ix,length)
  399. def characters(self,data,start_ix,length):
  400. if self.elem_level>0:
  401. self.write_data(data[start_ix:start_ix+length])
  402. def write_data(self,data):
  403. "Writes datachars to writer."
  404. data=string.replace(data,"&","&amp;")
  405. data=string.replace(data,"<","&lt;")
  406. data=string.replace(data,"\"","&quot;")
  407. data=string.replace(data,">","&gt;")
  408. data=string.replace(data,chr(9),"&#9;")
  409. data=string.replace(data,chr(10),"&#10;")
  410. data=string.replace(data,chr(13),"&#13;")
  411. self.writer.write(data)
  412. # --- mllib
  413. class mllib:
  414. """A re-implementation of the htmllib, sgmllib and xmllib interfaces as a
  415. SAX DocumentHandler."""
  416. # Unsupported:
  417. # - setnomoretags
  418. # - setliteral
  419. # - translate_references
  420. # - handle_xml
  421. # - handle_doctype
  422. # - handle_charref
  423. # - handle_entityref
  424. # - handle_comment
  425. # - handle_cdata
  426. # - tag_attributes
  427. def __init__(self):
  428. self.reset()
  429. def reset(self):
  430. import saxexts # only used here
  431. self.parser=saxexts.XMLParserFactory.make_parser()
  432. self.handler=mllib.Handler(self.parser,self)
  433. self.handler.reset()
  434. def feed(self,data):
  435. self.parser.feed(data)
  436. def close(self):
  437. self.parser.close()
  438. def get_stack(self):
  439. return self.handler.get_stack()
  440. # --- Handler methods (to be overridden)
  441. def handle_starttag(self,name,method,atts):
  442. method(atts)
  443. def handle_endtag(self,name,method):
  444. method()
  445. def handle_data(self,data):
  446. pass
  447. def handle_proc(self,target,data):
  448. pass
  449. def unknown_starttag(self,name,atts):
  450. pass
  451. def unknown_endtag(self,name):
  452. pass
  453. def syntax_error(self,message):
  454. pass
  455. # --- The internal handler class
  456. class Handler(saxlib.DocumentHandler,saxlib.ErrorHandler):
  457. """An internal class to handle SAX events and translate them to mllib
  458. events."""
  459. def __init__(self,driver,handler):
  460. self.driver=driver
  461. self.driver.setDocumentHandler(self)
  462. self.driver.setErrorHandler(self)
  463. self.handler=handler
  464. self.reset()
  465. def get_stack(self):
  466. return self.stack
  467. def reset(self):
  468. self.stack=[]
  469. # --- DocumentHandler methods
  470. def characters(self, ch, start, length):
  471. self.handler.handle_data(ch[start:start+length])
  472. def endElement(self, name):
  473. if hasattr(self.handler,"end_"+name):
  474. self.handler.handle_endtag(name,
  475. getattr(self.handler,"end_"+name))
  476. else:
  477. self.handler.unknown_endtag(name)
  478. del self.stack[-1]
  479. def ignorableWhitespace(self, ch, start, length):
  480. self.handler.handle_data(ch[start:start+length])
  481. def processingInstruction(self, target, data):
  482. self.handler.handle_proc(target,data)
  483. def startElement(self, name, atts):
  484. self.stack.append(name)
  485. if hasattr(self.handler,"start_"+name):
  486. self.handler.handle_starttag(name,
  487. getattr(self.handler,
  488. "start_"+name),
  489. atts)
  490. else:
  491. self.handler.unknown_starttag(name,atts)
  492. # --- ErrorHandler methods
  493. def error(self, exception):
  494. self.handler.syntax_error(str(exception))
  495. def fatalError(self, exception):
  496. raise RuntimeError(str(exception))