You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

sre_parse.py 25KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732
  1. #
  2. # Secret Labs' Regular Expression Engine
  3. #
  4. # convert re-style regular expression to sre pattern
  5. #
  6. # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
  7. #
  8. # See the sre.py file for information on usage and redistribution.
  9. #
  10. # XXX: show string offset and offending character for all errors
  11. # this module works under 1.5.2 and later. don't use string methods
  12. import string, sys
  13. from sre_constants import *
  14. SPECIAL_CHARS = ".\\[{()*+?^$|"
  15. REPEAT_CHARS = "*+?{"
  16. DIGITS = tuple("0123456789")
  17. OCTDIGITS = tuple("01234567")
  18. HEXDIGITS = tuple("0123456789abcdefABCDEF")
  19. WHITESPACE = tuple(" \t\n\r\v\f")
  20. ESCAPES = {
  21. r"\a": (LITERAL, ord("\a")),
  22. r"\b": (LITERAL, ord("\b")),
  23. r"\f": (LITERAL, ord("\f")),
  24. r"\n": (LITERAL, ord("\n")),
  25. r"\r": (LITERAL, ord("\r")),
  26. r"\t": (LITERAL, ord("\t")),
  27. r"\v": (LITERAL, ord("\v")),
  28. r"\\": (LITERAL, ord("\\"))
  29. }
  30. CATEGORIES = {
  31. r"\A": (AT, AT_BEGINNING_STRING), # start of string
  32. r"\b": (AT, AT_BOUNDARY),
  33. r"\B": (AT, AT_NON_BOUNDARY),
  34. r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
  35. r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
  36. r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
  37. r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
  38. r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
  39. r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
  40. r"\Z": (AT, AT_END_STRING), # end of string
  41. }
  42. FLAGS = {
  43. # standard flags
  44. "i": SRE_FLAG_IGNORECASE,
  45. "L": SRE_FLAG_LOCALE,
  46. "m": SRE_FLAG_MULTILINE,
  47. "s": SRE_FLAG_DOTALL,
  48. "x": SRE_FLAG_VERBOSE,
  49. # extensions
  50. "t": SRE_FLAG_TEMPLATE,
  51. "u": SRE_FLAG_UNICODE,
  52. }
  53. # figure out best way to convert hex/octal numbers to integers
  54. try:
  55. int("10", 8)
  56. atoi = int # 2.0 and later
  57. except TypeError:
  58. atoi = string.atoi # 1.5.2
  59. class Pattern:
  60. # master pattern object. keeps track of global attributes
  61. def __init__(self):
  62. self.flags = 0
  63. self.open = []
  64. self.groups = 1
  65. self.groupdict = {}
  66. def opengroup(self, name=None):
  67. gid = self.groups
  68. self.groups = gid + 1
  69. if name:
  70. self.groupdict[name] = gid
  71. self.open.append(gid)
  72. return gid
  73. def closegroup(self, gid):
  74. self.open.remove(gid)
  75. def checkgroup(self, gid):
  76. return gid < self.groups and gid not in self.open
  77. class SubPattern:
  78. # a subpattern, in intermediate form
  79. def __init__(self, pattern, data=None):
  80. self.pattern = pattern
  81. if not data:
  82. data = []
  83. self.data = data
  84. self.width = None
  85. def dump(self, level=0):
  86. nl = 1
  87. for op, av in self.data:
  88. print level*" " + op,; nl = 0
  89. if op == "in":
  90. # member sublanguage
  91. print; nl = 1
  92. for op, a in av:
  93. print (level+1)*" " + op, a
  94. elif op == "branch":
  95. print; nl = 1
  96. i = 0
  97. for a in av[1]:
  98. if i > 0:
  99. print level*" " + "or"
  100. a.dump(level+1); nl = 1
  101. i = i + 1
  102. elif type(av) in (type(()), type([])):
  103. for a in av:
  104. if isinstance(a, SubPattern):
  105. if not nl: print
  106. a.dump(level+1); nl = 1
  107. else:
  108. print a, ; nl = 0
  109. else:
  110. print av, ; nl = 0
  111. if not nl: print
  112. def __repr__(self):
  113. return repr(self.data)
  114. def __len__(self):
  115. return len(self.data)
  116. def __delitem__(self, index):
  117. del self.data[index]
  118. def __getitem__(self, index):
  119. return self.data[index]
  120. def __setitem__(self, index, code):
  121. self.data[index] = code
  122. def __getslice__(self, start, stop):
  123. return SubPattern(self.pattern, self.data[start:stop])
  124. def insert(self, index, code):
  125. self.data.insert(index, code)
  126. def append(self, code):
  127. self.data.append(code)
  128. def getwidth(self):
  129. # determine the width (min, max) for this subpattern
  130. if self.width:
  131. return self.width
  132. lo = hi = 0L
  133. for op, av in self.data:
  134. if op is BRANCH:
  135. i = sys.maxint
  136. j = 0
  137. for av in av[1]:
  138. l, h = av.getwidth()
  139. i = min(i, l)
  140. j = max(j, h)
  141. lo = lo + i
  142. hi = hi + j
  143. elif op is CALL:
  144. i, j = av.getwidth()
  145. lo = lo + i
  146. hi = hi + j
  147. elif op is SUBPATTERN:
  148. i, j = av[1].getwidth()
  149. lo = lo + i
  150. hi = hi + j
  151. elif op in (MIN_REPEAT, MAX_REPEAT):
  152. i, j = av[2].getwidth()
  153. lo = lo + long(i) * av[0]
  154. hi = hi + long(j) * av[1]
  155. elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
  156. lo = lo + 1
  157. hi = hi + 1
  158. elif op == SUCCESS:
  159. break
  160. self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
  161. return self.width
  162. class Tokenizer:
  163. def __init__(self, string):
  164. self.string = string
  165. self.index = 0
  166. self.__next()
  167. def __next(self):
  168. if self.index >= len(self.string):
  169. self.next = None
  170. return
  171. char = self.string[self.index]
  172. if char[0] == "\\":
  173. try:
  174. c = self.string[self.index + 1]
  175. except IndexError:
  176. raise error, "bogus escape"
  177. char = char + c
  178. self.index = self.index + len(char)
  179. self.next = char
  180. def match(self, char, skip=1):
  181. if char == self.next:
  182. if skip:
  183. self.__next()
  184. return 1
  185. return 0
  186. def get(self):
  187. this = self.next
  188. self.__next()
  189. return this
  190. def tell(self):
  191. return self.index, self.next
  192. def seek(self, index):
  193. self.index, self.next = index
  194. def isident(char):
  195. return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"
  196. def isdigit(char):
  197. return "0" <= char <= "9"
  198. def isname(name):
  199. # check that group name is a valid string
  200. if not isident(name[0]):
  201. return 0
  202. for char in name:
  203. if not isident(char) and not isdigit(char):
  204. return 0
  205. return 1
  206. def _group(escape, groups):
  207. # check if the escape string represents a valid group
  208. try:
  209. gid = atoi(escape[1:])
  210. if gid and gid < groups:
  211. return gid
  212. except ValueError:
  213. pass
  214. return None # not a valid group
  215. def _class_escape(source, escape):
  216. # handle escape code inside character class
  217. code = ESCAPES.get(escape)
  218. if code:
  219. return code
  220. code = CATEGORIES.get(escape)
  221. if code:
  222. return code
  223. try:
  224. if escape[1:2] == "x":
  225. # hexadecimal escape (exactly two digits)
  226. while source.next in HEXDIGITS and len(escape) < 4:
  227. escape = escape + source.get()
  228. escape = escape[2:]
  229. if len(escape) != 2:
  230. raise error, "bogus escape: %s" % repr("\\" + escape)
  231. return LITERAL, atoi(escape, 16) & 0xff
  232. elif str(escape[1:2]) in OCTDIGITS:
  233. # octal escape (up to three digits)
  234. while source.next in OCTDIGITS and len(escape) < 5:
  235. escape = escape + source.get()
  236. escape = escape[1:]
  237. return LITERAL, atoi(escape, 8) & 0xff
  238. if len(escape) == 2:
  239. return LITERAL, ord(escape[1])
  240. except ValueError:
  241. pass
  242. raise error, "bogus escape: %s" % repr(escape)
  243. def _escape(source, escape, state):
  244. # handle escape code in expression
  245. code = CATEGORIES.get(escape)
  246. if code:
  247. return code
  248. code = ESCAPES.get(escape)
  249. if code:
  250. return code
  251. try:
  252. if escape[1:2] == "x":
  253. # hexadecimal escape
  254. while source.next in HEXDIGITS and len(escape) < 4:
  255. escape = escape + source.get()
  256. if len(escape) != 4:
  257. raise ValueError
  258. return LITERAL, atoi(escape[2:], 16) & 0xff
  259. elif escape[1:2] == "0":
  260. # octal escape
  261. while source.next in OCTDIGITS and len(escape) < 4:
  262. escape = escape + source.get()
  263. return LITERAL, atoi(escape[1:], 8) & 0xff
  264. elif escape[1:2] in DIGITS:
  265. # octal escape *or* decimal group reference (sigh)
  266. here = source.tell()
  267. if source.next in DIGITS:
  268. escape = escape + source.get()
  269. if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
  270. source.next in OCTDIGITS):
  271. # got three octal digits; this is an octal escape
  272. escape = escape + source.get()
  273. return LITERAL, atoi(escape[1:], 8) & 0xff
  274. # got at least one decimal digit; this is a group reference
  275. group = _group(escape, state.groups)
  276. if group:
  277. if not state.checkgroup(group):
  278. raise error, "cannot refer to open group"
  279. return GROUPREF, group
  280. raise ValueError
  281. if len(escape) == 2:
  282. return LITERAL, ord(escape[1])
  283. except ValueError:
  284. pass
  285. raise error, "bogus escape: %s" % repr(escape)
  286. def _parse_sub(source, state, nested=1):
  287. # parse an alternation: a|b|c
  288. items = []
  289. while 1:
  290. items.append(_parse(source, state))
  291. if source.match("|"):
  292. continue
  293. if not nested:
  294. break
  295. if not source.next or source.match(")", 0):
  296. break
  297. else:
  298. raise error, "pattern not properly closed"
  299. if len(items) == 1:
  300. return items[0]
  301. subpattern = SubPattern(state)
  302. # check if all items share a common prefix
  303. while 1:
  304. prefix = None
  305. for item in items:
  306. if not item:
  307. break
  308. if prefix is None:
  309. prefix = item[0]
  310. elif item[0] != prefix:
  311. break
  312. else:
  313. # all subitems start with a common "prefix".
  314. # move it out of the branch
  315. for item in items:
  316. del item[0]
  317. subpattern.append(prefix)
  318. continue # check next one
  319. break
  320. # check if the branch can be replaced by a character set
  321. for item in items:
  322. if len(item) != 1 or item[0][0] != LITERAL:
  323. break
  324. else:
  325. # we can store this as a character set instead of a
  326. # branch (the compiler may optimize this even more)
  327. set = []
  328. for item in items:
  329. set.append(item[0])
  330. subpattern.append((IN, set))
  331. return subpattern
  332. subpattern.append((BRANCH, (None, items)))
  333. return subpattern
  334. def _parse(source, state):
  335. # parse a simple pattern
  336. subpattern = SubPattern(state)
  337. while 1:
  338. if source.next in ("|", ")"):
  339. break # end of subpattern
  340. this = source.get()
  341. if this is None:
  342. break # end of pattern
  343. if state.flags & SRE_FLAG_VERBOSE:
  344. # skip whitespace and comments
  345. if this in WHITESPACE:
  346. continue
  347. if this == "#":
  348. while 1:
  349. this = source.get()
  350. if this in (None, "\n"):
  351. break
  352. continue
  353. if this and this[0] not in SPECIAL_CHARS:
  354. subpattern.append((LITERAL, ord(this)))
  355. elif this == "[":
  356. # character set
  357. set = []
  358. ## if source.match(":"):
  359. ## pass # handle character classes
  360. if source.match("^"):
  361. set.append((NEGATE, None))
  362. # check remaining characters
  363. start = set[:]
  364. while 1:
  365. this = source.get()
  366. if this == "]" and set != start:
  367. break
  368. elif this and this[0] == "\\":
  369. code1 = _class_escape(source, this)
  370. elif this:
  371. code1 = LITERAL, ord(this)
  372. else:
  373. raise error, "unexpected end of regular expression"
  374. if source.match("-"):
  375. # potential range
  376. this = source.get()
  377. if this == "]":
  378. if code1[0] is IN:
  379. code1 = code1[1][0]
  380. set.append(code1)
  381. set.append((LITERAL, ord("-")))
  382. break
  383. else:
  384. if this[0] == "\\":
  385. code2 = _class_escape(source, this)
  386. else:
  387. code2 = LITERAL, ord(this)
  388. if code1[0] != LITERAL or code2[0] != LITERAL:
  389. raise error, "bad character range"
  390. lo = code1[1]
  391. hi = code2[1]
  392. if hi < lo:
  393. raise error, "bad character range"
  394. set.append((RANGE, (lo, hi)))
  395. else:
  396. if code1[0] is IN:
  397. code1 = code1[1][0]
  398. set.append(code1)
  399. # XXX: <fl> should move set optimization to compiler!
  400. if len(set)==1 and set[0][0] is LITERAL:
  401. subpattern.append(set[0]) # optimization
  402. elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
  403. subpattern.append((NOT_LITERAL, set[1][1])) # optimization
  404. else:
  405. # XXX: <fl> should add charmap optimization here
  406. subpattern.append((IN, set))
  407. elif this and this[0] in REPEAT_CHARS:
  408. # repeat previous item
  409. if this == "?":
  410. min, max = 0, 1
  411. elif this == "*":
  412. min, max = 0, MAXREPEAT
  413. elif this == "+":
  414. min, max = 1, MAXREPEAT
  415. elif this == "{":
  416. here = source.tell()
  417. min, max = 0, MAXREPEAT
  418. lo = hi = ""
  419. while source.next in DIGITS:
  420. lo = lo + source.get()
  421. if source.match(","):
  422. while source.next in DIGITS:
  423. hi = hi + source.get()
  424. else:
  425. hi = lo
  426. if not source.match("}"):
  427. subpattern.append((LITERAL, ord(this)))
  428. source.seek(here)
  429. continue
  430. if lo:
  431. min = atoi(lo)
  432. if hi:
  433. max = atoi(hi)
  434. if max < min:
  435. raise error, "bad repeat interval"
  436. else:
  437. raise error, "not supported"
  438. # figure out which item to repeat
  439. if subpattern:
  440. item = subpattern[-1:]
  441. else:
  442. item = None
  443. if not item or (len(item) == 1 and item[0][0] == AT):
  444. raise error, "nothing to repeat"
  445. if item[0][0] in (MIN_REPEAT, MAX_REPEAT):
  446. raise error, "multiple repeat"
  447. if source.match("?"):
  448. subpattern[-1] = (MIN_REPEAT, (min, max, item))
  449. else:
  450. subpattern[-1] = (MAX_REPEAT, (min, max, item))
  451. elif this == ".":
  452. subpattern.append((ANY, None))
  453. elif this == "(":
  454. group = 1
  455. name = None
  456. if source.match("?"):
  457. group = 0
  458. # options
  459. if source.match("P"):
  460. # python extensions
  461. if source.match("<"):
  462. # named group: skip forward to end of name
  463. name = ""
  464. while 1:
  465. char = source.get()
  466. if char is None:
  467. raise error, "unterminated name"
  468. if char == ">":
  469. break
  470. name = name + char
  471. group = 1
  472. if not isname(name):
  473. raise error, "bad character in group name"
  474. elif source.match("="):
  475. # named backreference
  476. name = ""
  477. while 1:
  478. char = source.get()
  479. if char is None:
  480. raise error, "unterminated name"
  481. if char == ")":
  482. break
  483. name = name + char
  484. if not isname(name):
  485. raise error, "bad character in group name"
  486. gid = state.groupdict.get(name)
  487. if gid is None:
  488. raise error, "unknown group name"
  489. subpattern.append((GROUPREF, gid))
  490. continue
  491. else:
  492. char = source.get()
  493. if char is None:
  494. raise error, "unexpected end of pattern"
  495. raise error, "unknown specifier: ?P%s" % char
  496. elif source.match(":"):
  497. # non-capturing group
  498. group = 2
  499. elif source.match("#"):
  500. # comment
  501. while 1:
  502. if source.next is None or source.next == ")":
  503. break
  504. source.get()
  505. if not source.match(")"):
  506. raise error, "unbalanced parenthesis"
  507. continue
  508. elif source.next in ("=", "!", "<"):
  509. # lookahead assertions
  510. char = source.get()
  511. dir = 1
  512. if char == "<":
  513. if source.next not in ("=", "!"):
  514. raise error, "syntax error"
  515. dir = -1 # lookbehind
  516. char = source.get()
  517. p = _parse_sub(source, state)
  518. if not source.match(")"):
  519. raise error, "unbalanced parenthesis"
  520. if char == "=":
  521. subpattern.append((ASSERT, (dir, p)))
  522. else:
  523. subpattern.append((ASSERT_NOT, (dir, p)))
  524. continue
  525. else:
  526. # flags
  527. if not FLAGS.has_key(source.next):
  528. raise error, "unexpected end of pattern"
  529. while FLAGS.has_key(source.next):
  530. state.flags = state.flags | FLAGS[source.get()]
  531. if group:
  532. # parse group contents
  533. if group == 2:
  534. # anonymous group
  535. group = None
  536. else:
  537. group = state.opengroup(name)
  538. p = _parse_sub(source, state)
  539. if not source.match(")"):
  540. raise error, "unbalanced parenthesis"
  541. if group is not None:
  542. state.closegroup(group)
  543. subpattern.append((SUBPATTERN, (group, p)))
  544. else:
  545. while 1:
  546. char = source.get()
  547. if char is None:
  548. raise error, "unexpected end of pattern"
  549. if char == ")":
  550. break
  551. raise error, "unknown extension"
  552. elif this == "^":
  553. subpattern.append((AT, AT_BEGINNING))
  554. elif this == "$":
  555. subpattern.append((AT, AT_END))
  556. elif this and this[0] == "\\":
  557. code = _escape(source, this, state)
  558. subpattern.append(code)
  559. else:
  560. raise error, "parser error"
  561. return subpattern
  562. def parse(str, flags=0, pattern=None):
  563. # parse 're' pattern into list of (opcode, argument) tuples
  564. source = Tokenizer(str)
  565. if pattern is None:
  566. pattern = Pattern()
  567. pattern.flags = flags
  568. pattern.str = str
  569. p = _parse_sub(source, pattern, 0)
  570. tail = source.get()
  571. if tail == ")":
  572. raise error, "unbalanced parenthesis"
  573. elif tail:
  574. raise error, "bogus characters at end of regular expression"
  575. if flags & SRE_FLAG_DEBUG:
  576. p.dump()
  577. if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
  578. # the VERBOSE flag was switched on inside the pattern. to be
  579. # on the safe side, we'll parse the whole thing again...
  580. return parse(str, p.pattern.flags)
  581. return p
  582. def parse_template(source, pattern):
  583. # parse 're' replacement string into list of literals and
  584. # group references
  585. s = Tokenizer(source)
  586. p = []
  587. a = p.append
  588. def literal(literal, p=p):
  589. if p and p[-1][0] is LITERAL:
  590. p[-1] = LITERAL, p[-1][1] + literal
  591. else:
  592. p.append((LITERAL, literal))
  593. sep = source[:0]
  594. if type(sep) is type(""):
  595. char = chr
  596. else:
  597. char = unichr
  598. while 1:
  599. this = s.get()
  600. if this is None:
  601. break # end of replacement string
  602. if this and this[0] == "\\":
  603. # group
  604. if this == "\\g":
  605. name = ""
  606. if s.match("<"):
  607. while 1:
  608. char = s.get()
  609. if char is None:
  610. raise error, "unterminated group name"
  611. if char == ">":
  612. break
  613. name = name + char
  614. if not name:
  615. raise error, "bad group name"
  616. try:
  617. index = atoi(name)
  618. except ValueError:
  619. if not isname(name):
  620. raise error, "bad character in group name"
  621. try:
  622. index = pattern.groupindex[name]
  623. except KeyError:
  624. raise IndexError, "unknown group name"
  625. a((MARK, index))
  626. elif len(this) > 1 and this[1] in DIGITS:
  627. code = None
  628. while 1:
  629. group = _group(this, pattern.groups+1)
  630. if group:
  631. if (s.next not in DIGITS or
  632. not _group(this + s.next, pattern.groups+1)):
  633. code = MARK, group
  634. break
  635. elif s.next in OCTDIGITS:
  636. this = this + s.get()
  637. else:
  638. break
  639. if not code:
  640. this = this[1:]
  641. code = LITERAL, char(atoi(this[-6:], 8) & 0xff)
  642. if code[0] is LITERAL:
  643. literal(code[1])
  644. else:
  645. a(code)
  646. else:
  647. try:
  648. this = char(ESCAPES[this][1])
  649. except KeyError:
  650. pass
  651. literal(this)
  652. else:
  653. literal(this)
  654. # convert template to groups and literals lists
  655. i = 0
  656. groups = []
  657. literals = []
  658. for c, s in p:
  659. if c is MARK:
  660. groups.append((i, s))
  661. literals.append(None)
  662. else:
  663. literals.append(s)
  664. i = i + 1
  665. return groups, literals
  666. def expand_template(template, match):
  667. g = match.group
  668. sep = match.string[:0]
  669. groups, literals = template
  670. literals = literals[:]
  671. try:
  672. for index, group in groups:
  673. literals[index] = s = g(group)
  674. if s is None:
  675. raise IndexError
  676. except IndexError:
  677. raise error, "empty group"
  678. return string.join(literals, sep)