
tokenize.py

  1. """Tokenization help for Python programs.
  2. This module exports a function called 'tokenize()' that breaks a stream of
  3. text into Python tokens. It accepts a readline-like method which is called
  4. repeatedly to get the next line of input (or "" for EOF) and a "token-eater"
  5. function which is called once for each token found. The latter function is
  6. passed the token type, a string containing the token, the starting and
  7. ending (row, column) coordinates of the token, and the original line. It is
  8. designed to match the working of the Python tokenizer exactly, except that
  9. it produces COMMENT tokens for comments and gives type OP for all operators."""
  10. __author__ = 'Ka-Ping Yee <ping@lfw.org>'
  11. __credits__ = \
  12. 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
  13. import string, re
  14. from token import *
  15. import token
  16. __all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize", "NL"]
  17. del token
  18. COMMENT = N_TOKENS
  19. tok_name[COMMENT] = 'COMMENT'
  20. NL = N_TOKENS + 1
  21. tok_name[NL] = 'NL'
  22. N_TOKENS += 2
  23. def group(*choices): return '(' + '|'.join(choices) + ')'
  24. def any(*choices): return apply(group, choices) + '*'
  25. def maybe(*choices): return apply(group, choices) + '?'
  26. Whitespace = r'[ \f\t]*'
  27. Comment = r'#[^\r\n]*'
  28. Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
  29. Name = r'[a-zA-Z_]\w*'
  30. Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
  31. Octnumber = r'0[0-7]*[lL]?'
  32. Decnumber = r'[1-9]\d*[lL]?'
  33. Intnumber = group(Hexnumber, Octnumber, Decnumber)
  34. Exponent = r'[eE][-+]?\d+'
  35. Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
  36. Expfloat = r'[1-9]\d*' + Exponent
  37. Floatnumber = group(Pointfloat, Expfloat)
  38. Imagnumber = group(r'0[jJ]', r'[1-9]\d*[jJ]', Floatnumber + r'[jJ]')
  39. Number = group(Imagnumber, Floatnumber, Intnumber)
  40. # Tail end of ' string.
  41. Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
  42. # Tail end of " string.
  43. Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
  44. # Tail end of ''' string.
  45. Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
  46. # Tail end of """ string.
  47. Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
  48. Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
  49. # Single-line ' or " string.
  50. String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
  51. r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
  52. # Because of leftmost-then-longest match semantics, be sure to put the
  53. # longest operators first (e.g., if = came before ==, == would get
  54. # recognized as two instances of =).
  55. Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
  56. r"[+\-*/%&|^=<>]=?",
  57. r"~")
  58. Bracket = '[][(){}]'
  59. Special = group(r'\r?\n', r'[:;.,`]')
  60. Funny = group(Operator, Bracket, Special)
  61. PlainToken = group(Number, Funny, String, Name)
  62. Token = Ignore + PlainToken
  63. # First (or only) line of ' or " string.
  64. ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
  65. group("'", r'\\\r?\n'),
  66. r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
  67. group('"', r'\\\r?\n'))
  68. PseudoExtras = group(r'\\\r?\n', Comment, Triple)
  69. PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
  70. tokenprog, pseudoprog, single3prog, double3prog = map(
  71. re.compile, (Token, PseudoToken, Single3, Double3))
  72. endprogs = {"'": re.compile(Single), '"': re.compile(Double),
  73. "'''": single3prog, '"""': double3prog,
  74. "r'''": single3prog, 'r"""': double3prog,
  75. "u'''": single3prog, 'u"""': double3prog,
  76. "ur'''": single3prog, 'ur"""': double3prog,
  77. "R'''": single3prog, 'R"""': double3prog,
  78. "U'''": single3prog, 'U"""': double3prog,
  79. "uR'''": single3prog, 'uR"""': double3prog,
  80. "Ur'''": single3prog, 'Ur"""': double3prog,
  81. "UR'''": single3prog, 'UR"""': double3prog,
  82. 'r': None, 'R': None, 'u': None, 'U': None}
  83. tabsize = 8
  84. class TokenError(Exception): pass
  85. class StopTokenizing(Exception): pass
  86. def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
  87. print "%d,%d-%d,%d:\t%s\t%s" % \
  88. (srow, scol, erow, ecol, tok_name[type], repr(token))
  89. def tokenize(readline, tokeneater=printtoken):
  90. try:
  91. tokenize_loop(readline, tokeneater)
  92. except StopTokenizing:
  93. pass
  94. def tokenize_loop(readline, tokeneater):
  95. lnum = parenlev = continued = 0
  96. namechars, numchars = string.letters + '_', string.digits
  97. contstr, needcont = '', 0
  98. contline = None
  99. indents = [0]
  100. while 1: # loop over lines in stream
  101. line = readline()
  102. lnum = lnum + 1
  103. pos, max = 0, len(line)
  104. if contstr: # continued string
  105. if not line:
  106. raise TokenError, ("EOF in multi-line string", strstart)
  107. endmatch = endprog.match(line)
  108. if endmatch:
  109. pos = end = endmatch.end(0)
  110. tokeneater(STRING, contstr + line[:end],
  111. strstart, (lnum, end), contline + line)
  112. contstr, needcont = '', 0
  113. contline = None
  114. elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
  115. tokeneater(ERRORTOKEN, contstr + line,
  116. strstart, (lnum, len(line)), contline)
  117. contstr = ''
  118. contline = None
  119. continue
  120. else:
  121. contstr = contstr + line
  122. contline = contline + line
  123. continue
  124. elif parenlev == 0 and not continued: # new statement
  125. if not line: break
  126. column = 0
  127. while pos < max: # measure leading whitespace
  128. if line[pos] == ' ': column = column + 1
  129. elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
  130. elif line[pos] == '\f': column = 0
  131. else: break
  132. pos = pos + 1
  133. if pos == max: break
  134. if line[pos] in '#\r\n': # skip comments or blank lines
  135. tokeneater((NL, COMMENT)[line[pos] == '#'], line[pos:],
  136. (lnum, pos), (lnum, len(line)), line)
  137. continue
  138. if column > indents[-1]: # count indents or dedents
  139. indents.append(column)
  140. tokeneater(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
  141. while column < indents[-1]:
  142. indents = indents[:-1]
  143. tokeneater(DEDENT, '', (lnum, pos), (lnum, pos), line)
  144. else: # continued statement
  145. if not line:
  146. raise TokenError, ("EOF in multi-line statement", (lnum, 0))
  147. continued = 0
  148. while pos < max:
  149. pseudomatch = pseudoprog.match(line, pos)
  150. if pseudomatch: # scan for tokens
  151. start, end = pseudomatch.span(1)
  152. spos, epos, pos = (lnum, start), (lnum, end), end
  153. token, initial = line[start:end], line[start]
  154. if initial in numchars or \
  155. (initial == '.' and token != '.'): # ordinary number
  156. tokeneater(NUMBER, token, spos, epos, line)
  157. elif initial in '\r\n':
  158. tokeneater(parenlev > 0 and NL or NEWLINE,
  159. token, spos, epos, line)
  160. elif initial == '#':
  161. tokeneater(COMMENT, token, spos, epos, line)
  162. elif token in ("'''", '"""', # triple-quoted
  163. "r'''", 'r"""', "R'''", 'R"""',
  164. "u'''", 'u"""', "U'''", 'U"""',
  165. "ur'''", 'ur"""', "Ur'''", 'Ur"""',
  166. "uR'''", 'uR"""', "UR'''", 'UR"""'):
  167. endprog = endprogs[token]
  168. endmatch = endprog.match(line, pos)
  169. if endmatch: # all on one line
  170. pos = endmatch.end(0)
  171. token = line[start:pos]
  172. tokeneater(STRING, token, spos, (lnum, pos), line)
  173. else:
  174. strstart = (lnum, start) # multiple lines
  175. contstr = line[start:]
  176. contline = line
  177. break
  178. elif initial in ("'", '"') or \
  179. token[:2] in ("r'", 'r"', "R'", 'R"',
  180. "u'", 'u"', "U'", 'U"') or \
  181. token[:3] in ("ur'", 'ur"', "Ur'", 'Ur"',
  182. "uR'", 'uR"', "UR'", 'UR"' ):
  183. if token[-1] == '\n': # continued string
  184. strstart = (lnum, start)
  185. endprog = (endprogs[initial] or endprogs[token[1]] or
  186. endprogs[token[2]])
  187. contstr, needcont = line[start:], 1
  188. contline = line
  189. break
  190. else: # ordinary string
  191. tokeneater(STRING, token, spos, epos, line)
  192. elif initial in namechars: # ordinary name
  193. tokeneater(NAME, token, spos, epos, line)
  194. elif initial == '\\': # continued stmt
  195. continued = 1
  196. else:
  197. if initial in '([{': parenlev = parenlev + 1
  198. elif initial in ')]}': parenlev = parenlev - 1
  199. tokeneater(OP, token, spos, epos, line)
  200. else:
  201. tokeneater(ERRORTOKEN, line[pos],
  202. (lnum, pos), (lnum, pos+1), line)
  203. pos = pos + 1
  204. for indent in indents[1:]: # pop remaining indent levels
  205. tokeneater(DEDENT, '', (lnum, 0), (lnum, 0), '')
  206. tokeneater(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
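
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): one way a
# caller can supply its own token-eater to tokenize(), following the
# readline/tokeneater protocol described in the module docstring above.
# The helper name _example_count_names and the default source string are
# hypothetical.
def _example_count_names(source="x = x + 1\n"):
    from StringIO import StringIO
    counts = {}
    def eater(type, token, (srow, scol), (erow, ecol), line):
        # called once per token; tally occurrences of NAME tokens
        if type == NAME:
            counts[token] = counts.get(token, 0) + 1
    tokenize(StringIO(source).readline, eater)
    return counts                              # {'x': 2} for the default source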

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)