
urlparse.py

  1. """Parse (absolute and relative) URLs.
  2. See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
  3. UC Irvine, June 1995.
  4. """
  5. __all__ = ["urlparse", "urlunparse", "urljoin"]
  6. # A classification of schemes ('' means apply by default)
  7. uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
  8. 'https', 'shttp',
  9. 'prospero', 'rtsp', 'rtspu', '']
  10. uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
  11. 'file',
  12. 'https', 'shttp', 'snews',
  13. 'prospero', 'rtsp', 'rtspu', '']
  14. non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
  15. 'snews', 'sip',
  16. ]
  17. uses_params = ['ftp', 'hdl', 'prospero', 'http',
  18. 'https', 'shttp', 'rtsp', 'rtspu', 'sip',
  19. '']
  20. uses_query = ['http', 'wais',
  21. 'https', 'shttp',
  22. 'gopher', 'rtsp', 'rtspu', 'sip',
  23. '']
  24. uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
  25. 'https', 'shttp', 'snews',
  26. 'file', 'prospero', '']
  27. # Characters valid in scheme names
  28. scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
  29. 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
  30. '0123456789'
  31. '+-.')
  32. MAX_CACHE_SIZE = 20
  33. _parse_cache = {}
  34. def clear_cache():
  35. """Clear the parse cache."""
  36. global _parse_cache
  37. _parse_cache = {}
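
# A small illustration of the cache (added comment; not part of the original
# module): repeated calls with identical arguments return the very same
# cached tuple object until clear_cache() is called or the cache overflows:
#
#   urlparse('http://a/b') is urlparse('http://a/b')   # -> true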
def urlparse(url, scheme = '', allow_fragments = 1):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = path = params = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                i = url.find('/', 2)
                if i < 0:
                    i = len(url)
                netloc = url[2:i]
                url = url[i:]
            if allow_fragments:
                i = url.rfind('#')
                if i >= 0:
                    fragment = url[i+1:]
                    url = url[:i]
            i = url.find('?')
            if i >= 0:
                query = url[i+1:]
                url = url[:i]
            i = url.find(';')
            if i >= 0:
                params = url[i+1:]
                url = url[:i]
            tuple = scheme, netloc, url, params, query, fragment
            _parse_cache[key] = tuple
            return tuple
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    if scheme in uses_netloc:
        if url[:2] == '//':
            i = url.find('/', 2)
            if i < 0:
                i = len(url)
            netloc, url = url[2:i], url[i:]
    if allow_fragments and scheme in uses_fragment:
        i = url.rfind('#')
        if i >= 0:
            url, fragment = url[:i], url[i+1:]
    if scheme in uses_query:
        i = url.find('?')
        if i >= 0:
            url, query = url[:i], url[i+1:]
    if scheme in uses_params:
        i = url.find(';')
        if i >= 0:
            url, params = url[:i], url[i+1:]
    tuple = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = tuple
    return tuple
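
# Doctest-style examples (added for illustration; the URLs are arbitrary):
#
#   >>> urlparse('http://www.cwi.nl:80/%7Eguido/Python.html')
#   ('http', 'www.cwi.nl:80', '/%7Eguido/Python.html', '', '', '')
#   >>> urlparse('http://a/b/c;p?q#f')
#   ('http', 'a', '/b/c', 'p', 'q', 'f')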
def urlunparse((scheme, netloc, url, params, query, fragment)):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    if netloc or (scheme in uses_netloc and url[:2] == '//'):
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
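
# Example (added for illustration) -- the inverse of urlparse, modulo
# redundant delimiters:
#
#   >>> urlunparse(('http', 'a', '/b/c', 'p', 'q', 'f'))
#   'http://a/b/c;p?q#f'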
def urljoin(base, url, allow_fragments = 1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    # Resolve '.' and '..' segments, RFC 1808 style: drop '.' segments,
    # then repeatedly remove the leftmost 'name/..' pair.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
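
# Examples (taken from the test_input table below):
#
#   >>> urljoin('http://a/b/c/d', 'g')
#   'http://a/b/c/g'
#   >>> urljoin('http://a/b/c/d', '../g')
#   'http://a/b/g'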
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    s, n, p, a, q, frag = urlparse(url)
    defrag = urlunparse((s, n, p, a, q, ''))
    return defrag, frag
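
# Example (added for illustration):
#
#   >>> urldefrag('http://a/b/c/d#frag')
#   ('http://a/b/c/d', 'frag')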
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y    = <URL:http://a/b/c/d?y>
      http:g?y   = <URL:http://a/b/c/g?y>
      http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
# XXX The result for //g is actually http://g/; is this a problem?

def test():
    import sys
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        import StringIO
        fp = StringIO.StringIO(test_input)
    while 1:
        line = fp.readline()
        if not line: break
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        print '%-10s : %s' % (url, parts)
        abs = urljoin(base, url)
        if not base:
            base = abs
        wrapped = '<URL:%s>' % abs
        print '%-10s = %s' % (url, wrapped)
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print 'EXPECTED', words[2], '!!!!!!!!!!'

if __name__ == '__main__':
    test()