You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

htmllib.py 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426
  1. """HTML 2.0 parser.
  2. See the HTML 2.0 specification:
  3. http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
  4. """
  5. from sgmllib import SGMLParser
  6. from formatter import AS_IS
  7. __all__ = ["HTMLParser"]
  8. class HTMLParser(SGMLParser):
  9. from htmlentitydefs import entitydefs
  10. def __init__(self, formatter, verbose=0):
  11. SGMLParser.__init__(self, verbose)
  12. self.formatter = formatter
  13. self.savedata = None
  14. self.isindex = 0
  15. self.title = None
  16. self.base = None
  17. self.anchor = None
  18. self.anchorlist = []
  19. self.nofill = 0
  20. self.list_stack = []
  21. # ------ Methods used internally; some may be overridden
  22. # --- Formatter interface, taking care of 'savedata' mode;
  23. # shouldn't need to be overridden
  24. def handle_data(self, data):
  25. if self.savedata is not None:
  26. self.savedata = self.savedata + data
  27. else:
  28. if self.nofill:
  29. self.formatter.add_literal_data(data)
  30. else:
  31. self.formatter.add_flowing_data(data)
  32. # --- Hooks to save data; shouldn't need to be overridden
  33. def save_bgn(self):
  34. self.savedata = ''
  35. def save_end(self):
  36. data = self.savedata
  37. self.savedata = None
  38. if not self.nofill:
  39. data = ' '.join(data.split())
  40. return data
  41. # --- Hooks for anchors; should probably be overridden
  42. def anchor_bgn(self, href, name, type):
  43. self.anchor = href
  44. if self.anchor:
  45. self.anchorlist.append(href)
  46. def anchor_end(self):
  47. if self.anchor:
  48. self.handle_data("[%d]" % len(self.anchorlist))
  49. self.anchor = None
  50. # --- Hook for images; should probably be overridden
  51. def handle_image(self, src, alt, *args):
  52. self.handle_data(alt)
  53. # --------- Top level elememts
  54. def start_html(self, attrs): pass
  55. def end_html(self): pass
  56. def start_head(self, attrs): pass
  57. def end_head(self): pass
  58. def start_body(self, attrs): pass
  59. def end_body(self): pass
  60. # ------ Head elements
  61. def start_title(self, attrs):
  62. self.save_bgn()
  63. def end_title(self):
  64. self.title = self.save_end()
  65. def do_base(self, attrs):
  66. for a, v in attrs:
  67. if a == 'href':
  68. self.base = v
  69. def do_isindex(self, attrs):
  70. self.isindex = 1
  71. def do_link(self, attrs):
  72. pass
  73. def do_meta(self, attrs):
  74. pass
  75. def do_nextid(self, attrs): # Deprecated
  76. pass
  77. # ------ Body elements
  78. # --- Headings
  79. def start_h1(self, attrs):
  80. self.formatter.end_paragraph(1)
  81. self.formatter.push_font(('h1', 0, 1, 0))
  82. def end_h1(self):
  83. self.formatter.end_paragraph(1)
  84. self.formatter.pop_font()
  85. def start_h2(self, attrs):
  86. self.formatter.end_paragraph(1)
  87. self.formatter.push_font(('h2', 0, 1, 0))
  88. def end_h2(self):
  89. self.formatter.end_paragraph(1)
  90. self.formatter.pop_font()
  91. def start_h3(self, attrs):
  92. self.formatter.end_paragraph(1)
  93. self.formatter.push_font(('h3', 0, 1, 0))
  94. def end_h3(self):
  95. self.formatter.end_paragraph(1)
  96. self.formatter.pop_font()
  97. def start_h4(self, attrs):
  98. self.formatter.end_paragraph(1)
  99. self.formatter.push_font(('h4', 0, 1, 0))
  100. def end_h4(self):
  101. self.formatter.end_paragraph(1)
  102. self.formatter.pop_font()
  103. def start_h5(self, attrs):
  104. self.formatter.end_paragraph(1)
  105. self.formatter.push_font(('h5', 0, 1, 0))
  106. def end_h5(self):
  107. self.formatter.end_paragraph(1)
  108. self.formatter.pop_font()
  109. def start_h6(self, attrs):
  110. self.formatter.end_paragraph(1)
  111. self.formatter.push_font(('h6', 0, 1, 0))
  112. def end_h6(self):
  113. self.formatter.end_paragraph(1)
  114. self.formatter.pop_font()
  115. # --- Block Structuring Elements
  116. def do_p(self, attrs):
  117. self.formatter.end_paragraph(1)
  118. def start_pre(self, attrs):
  119. self.formatter.end_paragraph(1)
  120. self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
  121. self.nofill = self.nofill + 1
  122. def end_pre(self):
  123. self.formatter.end_paragraph(1)
  124. self.formatter.pop_font()
  125. self.nofill = max(0, self.nofill - 1)
  126. def start_xmp(self, attrs):
  127. self.start_pre(attrs)
  128. self.setliteral('xmp') # Tell SGML parser
  129. def end_xmp(self):
  130. self.end_pre()
  131. def start_listing(self, attrs):
  132. self.start_pre(attrs)
  133. self.setliteral('listing') # Tell SGML parser
  134. def end_listing(self):
  135. self.end_pre()
  136. def start_address(self, attrs):
  137. self.formatter.end_paragraph(0)
  138. self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
  139. def end_address(self):
  140. self.formatter.end_paragraph(0)
  141. self.formatter.pop_font()
  142. def start_blockquote(self, attrs):
  143. self.formatter.end_paragraph(1)
  144. self.formatter.push_margin('blockquote')
  145. def end_blockquote(self):
  146. self.formatter.end_paragraph(1)
  147. self.formatter.pop_margin()
  148. # --- List Elements
  149. def start_ul(self, attrs):
  150. self.formatter.end_paragraph(not self.list_stack)
  151. self.formatter.push_margin('ul')
  152. self.list_stack.append(['ul', '*', 0])
  153. def end_ul(self):
  154. if self.list_stack: del self.list_stack[-1]
  155. self.formatter.end_paragraph(not self.list_stack)
  156. self.formatter.pop_margin()
  157. def do_li(self, attrs):
  158. self.formatter.end_paragraph(0)
  159. if self.list_stack:
  160. [dummy, label, counter] = top = self.list_stack[-1]
  161. top[2] = counter = counter+1
  162. else:
  163. label, counter = '*', 0
  164. self.formatter.add_label_data(label, counter)
  165. def start_ol(self, attrs):
  166. self.formatter.end_paragraph(not self.list_stack)
  167. self.formatter.push_margin('ol')
  168. label = '1.'
  169. for a, v in attrs:
  170. if a == 'type':
  171. if len(v) == 1: v = v + '.'
  172. label = v
  173. self.list_stack.append(['ol', label, 0])
  174. def end_ol(self):
  175. if self.list_stack: del self.list_stack[-1]
  176. self.formatter.end_paragraph(not self.list_stack)
  177. self.formatter.pop_margin()
  178. def start_menu(self, attrs):
  179. self.start_ul(attrs)
  180. def end_menu(self):
  181. self.end_ul()
  182. def start_dir(self, attrs):
  183. self.start_ul(attrs)
  184. def end_dir(self):
  185. self.end_ul()
  186. def start_dl(self, attrs):
  187. self.formatter.end_paragraph(1)
  188. self.list_stack.append(['dl', '', 0])
  189. def end_dl(self):
  190. self.ddpop(1)
  191. if self.list_stack: del self.list_stack[-1]
  192. def do_dt(self, attrs):
  193. self.ddpop()
  194. def do_dd(self, attrs):
  195. self.ddpop()
  196. self.formatter.push_margin('dd')
  197. self.list_stack.append(['dd', '', 0])
  198. def ddpop(self, bl=0):
  199. self.formatter.end_paragraph(bl)
  200. if self.list_stack:
  201. if self.list_stack[-1][0] == 'dd':
  202. del self.list_stack[-1]
  203. self.formatter.pop_margin()
  204. # --- Phrase Markup
  205. # Idiomatic Elements
  206. def start_cite(self, attrs): self.start_i(attrs)
  207. def end_cite(self): self.end_i()
  208. def start_code(self, attrs): self.start_tt(attrs)
  209. def end_code(self): self.end_tt()
  210. def start_em(self, attrs): self.start_i(attrs)
  211. def end_em(self): self.end_i()
  212. def start_kbd(self, attrs): self.start_tt(attrs)
  213. def end_kbd(self): self.end_tt()
  214. def start_samp(self, attrs): self.start_tt(attrs)
  215. def end_samp(self): self.end_tt()
  216. def start_strong(self, attrs): self.start_b(attrs)
  217. def end_strong(self): self.end_b()
  218. def start_var(self, attrs): self.start_i(attrs)
  219. def end_var(self): self.end_i()
  220. # Typographic Elements
  221. def start_i(self, attrs):
  222. self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
  223. def end_i(self):
  224. self.formatter.pop_font()
  225. def start_b(self, attrs):
  226. self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
  227. def end_b(self):
  228. self.formatter.pop_font()
  229. def start_tt(self, attrs):
  230. self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
  231. def end_tt(self):
  232. self.formatter.pop_font()
  233. def start_a(self, attrs):
  234. href = ''
  235. name = ''
  236. type = ''
  237. for attrname, value in attrs:
  238. value = value.strip()
  239. if attrname == 'href':
  240. href = value
  241. if attrname == 'name':
  242. name = value
  243. if attrname == 'type':
  244. type = value.lower()
  245. self.anchor_bgn(href, name, type)
  246. def end_a(self):
  247. self.anchor_end()
  248. # --- Line Break
  249. def do_br(self, attrs):
  250. self.formatter.add_line_break()
  251. # --- Horizontal Rule
  252. def do_hr(self, attrs):
  253. self.formatter.add_hor_rule()
  254. # --- Image
  255. def do_img(self, attrs):
  256. align = ''
  257. alt = '(image)'
  258. ismap = ''
  259. src = ''
  260. width = 0
  261. height = 0
  262. for attrname, value in attrs:
  263. if attrname == 'align':
  264. align = value
  265. if attrname == 'alt':
  266. alt = value
  267. if attrname == 'ismap':
  268. ismap = value
  269. if attrname == 'src':
  270. src = value
  271. if attrname == 'width':
  272. try: width = int(value)
  273. except: pass
  274. if attrname == 'height':
  275. try: height = int(value)
  276. except: pass
  277. self.handle_image(src, alt, ismap, align, width, height)
  278. # --- Really Old Unofficial Deprecated Stuff
  279. def do_plaintext(self, attrs):
  280. self.start_pre(attrs)
  281. self.setnomoretags() # Tell SGML parser
  282. # --- Unhandled tags
  283. def unknown_starttag(self, tag, attrs):
  284. pass
  285. def unknown_endtag(self, tag):
  286. pass
  287. def test(args = None):
  288. import sys, formatter
  289. if not args:
  290. args = sys.argv[1:]
  291. silent = args and args[0] == '-s'
  292. if silent:
  293. del args[0]
  294. if args:
  295. file = args[0]
  296. else:
  297. file = 'test.html'
  298. if file == '-':
  299. f = sys.stdin
  300. else:
  301. try:
  302. f = open(file, 'r')
  303. except IOError, msg:
  304. print file, ":", msg
  305. sys.exit(1)
  306. data = f.read()
  307. if f is not sys.stdin:
  308. f.close()
  309. if silent:
  310. f = formatter.NullFormatter()
  311. else:
  312. f = formatter.AbstractFormatter(formatter.DumbWriter())
  313. p = HTMLParser(f)
  314. p.feed(data)
  315. p.close()
  316. if __name__ == '__main__':
  317. test()