difflib.py

#! /usr/bin/env python

"""
Module difflib -- helpers for computing deltas between objects.

Function get_close_matches(word, possibilities, n=3, cutoff=0.6):

    Use SequenceMatcher to return list of the best "good enough" matches.

    word is a sequence for which close matches are desired (typically a
    string).

    possibilities is a list of sequences against which to match word
    (typically a list of strings).

    Optional arg n (default 3) is the maximum number of close matches to
    return.  n must be > 0.

    Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
    that don't score at least that similar to word are ignored.

    The best (no more than n) matches among the possibilities are returned
    in a list, sorted by similarity score, most similar first.

    >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
    ['apple', 'ape']
    >>> import keyword
    >>> get_close_matches("wheel", keyword.kwlist)
    ['while']
    >>> get_close_matches("apple", keyword.kwlist)
    []
    >>> get_close_matches("accept", keyword.kwlist)
    ['except']

Class SequenceMatcher

SequenceMatcher is a flexible class for comparing pairs of sequences of any
type, so long as the sequence elements are hashable.  The basic algorithm
predates, and is a little fancier than, an algorithm published in the late
1980's by Ratcliff and Obershelp under the hyperbolic name "gestalt pattern
matching".  The basic idea is to find the longest contiguous matching
subsequence that contains no "junk" elements (R-O doesn't address junk).
The same idea is then applied recursively to the pieces of the sequences to
the left and to the right of the matching subsequence.  This does not yield
minimal edit sequences, but does tend to yield matches that "look right"
to people.

Example, comparing two strings, and considering blanks to be "junk":

>>> s = SequenceMatcher(lambda x: x == " ",
...                     "private Thread currentThread;",
...                     "private volatile Thread currentThread;")
>>>

.ratio() returns a float in [0, 1], measuring the "similarity" of the
sequences.  As a rule of thumb, a .ratio() value over 0.6 means the
sequences are close matches:

>>> print round(s.ratio(), 3)
0.866
>>>

If you're only interested in where the sequences match,
.get_matching_blocks() is handy:

>>> for block in s.get_matching_blocks():
...     print "a[%d] and b[%d] match for %d elements" % block
a[0] and b[0] match for 8 elements
a[8] and b[17] match for 6 elements
a[14] and b[23] match for 15 elements
a[29] and b[38] match for 0 elements

Note that the last tuple returned by .get_matching_blocks() is always a
dummy, (len(a), len(b), 0), and this is the only case in which the last
tuple element (number of elements matched) is 0.

If you want to know how to change the first sequence into the second, use
.get_opcodes():

>>> for opcode in s.get_opcodes():
...     print "%6s a[%d:%d] b[%d:%d]" % opcode
 equal a[0:8] b[0:8]
insert a[8:8] b[8:17]
 equal a[8:14] b[17:23]
 equal a[14:29] b[23:38]

See Tools/scripts/ndiff.py for a fancy human-friendly file differencer,
which uses SequenceMatcher both to view files as sequences of lines, and
lines as sequences of characters.

See also function get_close_matches() in this module, which shows how
simple code building on SequenceMatcher can be used to do useful work.

Timing:  Basic R-O is cubic time worst case and quadratic time expected
case.  SequenceMatcher is quadratic time for the worst case and has
expected-case behavior dependent in a complicated way on how many
elements the sequences have in common; best case time is linear.

SequenceMatcher methods:

__init__(isjunk=None, a='', b='')
    Construct a SequenceMatcher.

    Optional arg isjunk is None (the default), or a one-argument function
    that takes a sequence element and returns true iff the element is junk.
    None is equivalent to passing "lambda x: 0", i.e. no elements are
    considered to be junk.  For example, pass
        lambda x: x in " \\t"
    if you're comparing lines as sequences of characters, and don't want to
    synch up on blanks or hard tabs.

    Optional arg a is the first of two sequences to be compared.  By
    default, an empty string.  The elements of a must be hashable.

    Optional arg b is the second of two sequences to be compared.  By
    default, an empty string.  The elements of b must be hashable.

set_seqs(a, b)
    Set the two sequences to be compared.

    >>> s = SequenceMatcher()
    >>> s.set_seqs("abcd", "bcde")
    >>> s.ratio()
    0.75

set_seq1(a)
    Set the first sequence to be compared.

    The second sequence to be compared is not changed.

    >>> s = SequenceMatcher(None, "abcd", "bcde")
    >>> s.ratio()
    0.75
    >>> s.set_seq1("bcde")
    >>> s.ratio()
    1.0
    >>>

    SequenceMatcher computes and caches detailed information about the
    second sequence, so if you want to compare one sequence S against many
    sequences, use .set_seq2(S) once and call .set_seq1(x) repeatedly for
    each of the other sequences.

    See also set_seqs() and set_seq2().

set_seq2(b)
    Set the second sequence to be compared.

    The first sequence to be compared is not changed.

    >>> s = SequenceMatcher(None, "abcd", "bcde")
    >>> s.ratio()
    0.75
    >>> s.set_seq2("abcd")
    >>> s.ratio()
    1.0
    >>>

    SequenceMatcher computes and caches detailed information about the
    second sequence, so if you want to compare one sequence S against many
    sequences, use .set_seq2(S) once and call .set_seq1(x) repeatedly for
    each of the other sequences.

    See also set_seqs() and set_seq1().

find_longest_match(alo, ahi, blo, bhi)
    Find longest matching block in a[alo:ahi] and b[blo:bhi].

    If isjunk is not defined:

    Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
        alo <= i <= i+k <= ahi
        blo <= j <= j+k <= bhi
    and for all (i',j',k') meeting those conditions,
        k >= k'
        i <= i'
        and if i == i', j <= j'

    In other words, of all maximal matching blocks, return one that starts
    earliest in a, and of all those maximal matching blocks that start
    earliest in a, return the one that starts earliest in b.

    >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
    >>> s.find_longest_match(0, 5, 0, 9)
    (0, 4, 5)

    If isjunk is defined, first the longest matching block is determined as
    above, but with the additional restriction that no junk element appears
    in the block.  Then that block is extended as far as possible by
    matching (only) junk elements on both sides.  So the resulting block
    never matches on junk except as identical junk happens to be adjacent
    to an "interesting" match.

    Here's the same example as before, but considering blanks to be junk.
    That prevents " abcd" from matching the " abcd" at the tail end of the
    second sequence directly.  Instead only the "abcd" can match, and
    matches the leftmost "abcd" in the second sequence:

    >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
    >>> s.find_longest_match(0, 5, 0, 9)
    (1, 0, 4)

    If no blocks match, return (alo, blo, 0).

    >>> s = SequenceMatcher(None, "ab", "c")
    >>> s.find_longest_match(0, 2, 0, 1)
    (0, 0, 0)

get_matching_blocks()
    Return list of triples describing matching subsequences.

    Each triple is of the form (i, j, n), and means that
    a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in i
    and in j.

    The last triple is a dummy, (len(a), len(b), 0), and is the only triple
    with n==0.

    >>> s = SequenceMatcher(None, "abxcd", "abcd")
    >>> s.get_matching_blocks()
    [(0, 0, 2), (3, 2, 2), (5, 4, 0)]

get_opcodes()
    Return list of 5-tuples describing how to turn a into b.

    Each tuple is of the form (tag, i1, i2, j1, j2).  The first tuple has
    i1 == j1 == 0, and remaining tuples have i1 == the i2 from the tuple
    preceding it, and likewise for j1 == the previous j2.

    The tags are strings, with these meanings:

    'replace':  a[i1:i2] should be replaced by b[j1:j2]
    'delete':   a[i1:i2] should be deleted.
                Note that j1==j2 in this case.
    'insert':   b[j1:j2] should be inserted at a[i1:i1].
                Note that i1==i2 in this case.
    'equal':    a[i1:i2] == b[j1:j2]

    >>> a = "qabxcd"
    >>> b = "abycdf"
    >>> s = SequenceMatcher(None, a, b)
    >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
    ...    print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
    ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
     delete a[0:1] (q) b[0:0] ()
      equal a[1:3] (ab) b[0:2] (ab)
    replace a[3:4] (x) b[2:3] (y)
      equal a[4:6] (cd) b[3:5] (cd)
     insert a[6:6] () b[5:6] (f)

ratio()
    Return a measure of the sequences' similarity (float in [0,1]).

    Where T is the total number of elements in both sequences, and M is the
    number of matches, this is 2.0*M / T.  Note that this is 1 if the
    sequences are identical, and 0 if they have nothing in common.

    .ratio() is expensive to compute if you haven't already computed
    .get_matching_blocks() or .get_opcodes(), in which case you may want to
    try .quick_ratio() or .real_quick_ratio() first to get an upper bound.

    >>> s = SequenceMatcher(None, "abcd", "bcde")
    >>> s.ratio()
    0.75
    >>> s.quick_ratio()
    0.75
    >>> s.real_quick_ratio()
    1.0

quick_ratio()
    Return an upper bound on .ratio() relatively quickly.

    This isn't defined beyond that it is an upper bound on .ratio(), and
    is faster to compute.

real_quick_ratio():
    Return an upper bound on ratio() very quickly.

    This isn't defined beyond that it is an upper bound on .ratio(), and
    is faster to compute than either .ratio() or .quick_ratio().
"""

TRACE = 0

class SequenceMatcher:

    def __init__(self, isjunk=None, a='', b=''):
        """Construct a SequenceMatcher.

        Optional arg isjunk is None (the default), or a one-argument
        function that takes a sequence element and returns true iff the
        element is junk.  None is equivalent to passing "lambda x: 0", i.e.
        no elements are considered to be junk.  For example, pass
            lambda x: x in " \\t"
        if you're comparing lines as sequences of characters, and don't
        want to synch up on blanks or hard tabs.

        Optional arg a is the first of two sequences to be compared.  By
        default, an empty string.  The elements of a must be hashable.  See
        also .set_seqs() and .set_seq1().

        Optional arg b is the second of two sequences to be compared.  By
        default, an empty string.  The elements of b must be hashable.  See
        also .set_seqs() and .set_seq2().
        """

        # Members:
        # a
        #      first sequence
        # b
        #      second sequence; differences are computed as "what do
        #      we need to do to 'a' to change it into 'b'?"
        # b2j
        #      for x in b, b2j[x] is a list of the indices (into b)
        #      at which x appears; junk elements do not appear
        # b2jhas
        #      b2j.has_key
        # fullbcount
        #      for x in b, fullbcount[x] == the number of times x
        #      appears in b; only materialized if really needed (used
        #      only for computing quick_ratio())
        # matching_blocks
        #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
        #      ascending & non-overlapping in i and in j; terminated by
        #      a dummy (len(a), len(b), 0) sentinel
        # opcodes
        #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
        #      one of
        #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
        #          'delete'    a[i1:i2] should be deleted
        #          'insert'    b[j1:j2] should be inserted
        #          'equal'     a[i1:i2] == b[j1:j2]
        # isjunk
        #      a user-supplied function taking a sequence element and
        #      returning true iff the element is "junk" -- this has
        #      subtle but helpful effects on the algorithm, which I'll
        #      get around to writing up someday <0.9 wink>.
        #      DON'T USE!  Only __chain_b uses this.  Use isbjunk.
        # isbjunk
        #      for x in b, isbjunk(x) == isjunk(x) but much faster;
        #      it's really the has_key method of a hidden dict.
        #      DOES NOT WORK for x in a!

        self.isjunk = isjunk
        self.a = self.b = None
        self.set_seqs(a, b)

    def set_seqs(self, a, b):
        """Set the two sequences to be compared.

        >>> s = SequenceMatcher()
        >>> s.set_seqs("abcd", "bcde")
        >>> s.ratio()
        0.75
        """

        self.set_seq1(a)
        self.set_seq2(b)

    def set_seq1(self, a):
        """Set the first sequence to be compared.

        The second sequence to be compared is not changed.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.set_seq1("bcde")
        >>> s.ratio()
        1.0
        >>>

        SequenceMatcher computes and caches detailed information about the
        second sequence, so if you want to compare one sequence S against
        many sequences, use .set_seq2(S) once and call .set_seq1(x)
        repeatedly for each of the other sequences.

        See also set_seqs() and set_seq2().
        """

        if a is self.a:
            return
        self.a = a
        self.matching_blocks = self.opcodes = None

    def set_seq2(self, b):
        """Set the second sequence to be compared.

        The first sequence to be compared is not changed.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.set_seq2("abcd")
        >>> s.ratio()
        1.0
        >>>

        SequenceMatcher computes and caches detailed information about the
        second sequence, so if you want to compare one sequence S against
        many sequences, use .set_seq2(S) once and call .set_seq1(x)
        repeatedly for each of the other sequences.

        See also set_seqs() and set_seq1().
        """

        if b is self.b:
            return
        self.b = b
        self.matching_blocks = self.opcodes = None
        self.fullbcount = None
        self.__chain_b()

    # For each element x in b, set b2j[x] to a list of the indices in
    # b where x appears; the indices are in increasing order; note that
    # the number of times x appears in b is len(b2j[x]) ...
    # when self.isjunk is defined, junk elements don't show up in this
    # map at all, which stops the central find_longest_match method
    # from starting any matching block at a junk element ...
    # also creates the fast isbjunk function ...
    # note that this is only called when b changes; so for cross-product
    # kinds of matches, it's best to call set_seq2 once, then set_seq1
    # repeatedly
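    # (for example, with b = "abcab" and no junk, b2j would end up as
    #  {'a': [0, 3], 'b': [1, 4], 'c': [2]})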
    def __chain_b(self):
        # Because isjunk is a user-defined (not C) function, and we test
        # for junk a LOT, it's important to minimize the number of calls.
        # Before the tricks described here, __chain_b was by far the most
        # time-consuming routine in the whole module!  If anyone sees
        # Jim Roskind, thank him again for profile.py -- I never would
        # have guessed that.
        # The first trick is to build b2j ignoring the possibility
        # of junk.  I.e., we don't call isjunk at all yet.  Throwing
        # out the junk later is much cheaper than building b2j "right"
        # from the start.
        b = self.b
        self.b2j = b2j = {}
        self.b2jhas = b2jhas = b2j.has_key
        for i in xrange(len(b)):
            elt = b[i]
            if b2jhas(elt):
                b2j[elt].append(i)
            else:
                b2j[elt] = [i]

        # Now b2j.keys() contains elements uniquely, and especially when
        # the sequence is a string, that's usually a good deal smaller
        # than len(string).  The difference is the number of isjunk calls
        # saved.
        isjunk, junkdict = self.isjunk, {}
        if isjunk:
            for elt in b2j.keys():
                if isjunk(elt):
                    junkdict[elt] = 1   # value irrelevant; it's a set
                    del b2j[elt]

        # Now for x in b, isjunk(x) == junkdict.has_key(x), but the
        # latter is much faster.  Note too that while there may be a
        # lot of junk in the sequence, the number of *unique* junk
        # elements is probably small.  So the memory burden of keeping
        # this dict alive is likely trivial compared to the size of b2j.
        self.isbjunk = junkdict.has_key

    def find_longest_match(self, alo, ahi, blo, bhi):
        """Find longest matching block in a[alo:ahi] and b[blo:bhi].

        If isjunk is not defined:

        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
            alo <= i <= i+k <= ahi
            blo <= j <= j+k <= bhi
        and for all (i',j',k') meeting those conditions,
            k >= k'
            i <= i'
            and if i == i', j <= j'

        In other words, of all maximal matching blocks, return one that
        starts earliest in a, and of all those maximal matching blocks that
        start earliest in a, return the one that starts earliest in b.

        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (0, 4, 5)

        If isjunk is defined, first the longest matching block is
        determined as above, but with the additional restriction that no
        junk element appears in the block.  Then that block is extended as
        far as possible by matching (only) junk elements on both sides.  So
        the resulting block never matches on junk except as identical junk
        happens to be adjacent to an "interesting" match.

        Here's the same example as before, but considering blanks to be
        junk.  That prevents " abcd" from matching the " abcd" at the tail
        end of the second sequence directly.  Instead only the "abcd" can
        match, and matches the leftmost "abcd" in the second sequence:

        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
        >>> s.find_longest_match(0, 5, 0, 9)
        (1, 0, 4)

        If no blocks match, return (alo, blo, 0).

        >>> s = SequenceMatcher(None, "ab", "c")
        >>> s.find_longest_match(0, 2, 0, 1)
        (0, 0, 0)
        """

        # CAUTION:  stripping common prefix or suffix would be incorrect.
        # E.g.,
        #    ab
        #    acab
        # Longest matching block is "ab", but if common prefix is
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
        # strip, so ends up claiming that ab is changed to acab by
        # inserting "ca" in the middle.  That's minimal but unintuitive:
        # "it's obvious" that someone inserted "ac" at the front.
        # Windiff ends up at the same place as diff, but by pairing up
        # the unique 'b's and then matching the first two 'a's.

        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
        besti, bestj, bestsize = alo, blo, 0
        # find longest junk-free match
        # during an iteration of the loop, j2len[j] = length of longest
        # junk-free match ending with a[i-1] and b[j]
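        # equivalently: when a[i] == b[j], newj2len[j] = j2len[j-1] + 1,
        # extending the junk-free match that ended with a[i-1] and b[j-1]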
        j2len = {}
        nothing = []
        for i in xrange(alo, ahi):
            # look at all instances of a[i] in b; note that because
            # b2j has no junk keys, the loop is skipped if a[i] is junk
            j2lenget = j2len.get
            newj2len = {}
            for j in b2j.get(a[i], nothing):
                # a[i] matches b[j]
                if j < blo:
                    continue
                if j >= bhi:
                    break
                k = newj2len[j] = j2lenget(j-1, 0) + 1
                if k > bestsize:
                    besti, bestj, bestsize = i-k+1, j-k+1, k
            j2len = newj2len

        # Now that we have a wholly interesting match (albeit possibly
        # empty!), we may as well suck up the matching junk on each
        # side of it too.  Can't think of a good reason not to, and it
        # saves post-processing the (possibly considerable) expense of
        # figuring out what to do with it.  In the case of an empty
        # interesting match, this is clearly the right thing to do,
        # because no other kind of match is possible in the regions.
        while besti > alo and bestj > blo and \
              isbjunk(b[bestj-1]) and \
              a[besti-1] == b[bestj-1]:
            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
        while besti+bestsize < ahi and bestj+bestsize < bhi and \
              isbjunk(b[bestj+bestsize]) and \
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1

        if TRACE:
            print "get_matching_blocks", alo, ahi, blo, bhi
            print "    returns", besti, bestj, bestsize
        return besti, bestj, bestsize

    def get_matching_blocks(self):
        """Return list of triples describing matching subsequences.

        Each triple is of the form (i, j, n), and means that
        a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
        i and in j.

        The last triple is a dummy, (len(a), len(b), 0), and is the only
        triple with n==0.

        >>> s = SequenceMatcher(None, "abxcd", "abcd")
        >>> s.get_matching_blocks()
        [(0, 0, 2), (3, 2, 2), (5, 4, 0)]
        """

        if self.matching_blocks is not None:
            return self.matching_blocks
        self.matching_blocks = []
        la, lb = len(self.a), len(self.b)
        self.__helper(0, la, 0, lb, self.matching_blocks)
        self.matching_blocks.append( (la, lb, 0) )
        if TRACE:
            print '*** matching blocks', self.matching_blocks
        return self.matching_blocks

    # builds list of matching blocks covering a[alo:ahi] and
    # b[blo:bhi], appending them in increasing order to answer

    def __helper(self, alo, ahi, blo, bhi, answer):
        i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
        # a[alo:i] vs b[blo:j] unknown
        # a[i:i+k] same as b[j:j+k]
        # a[i+k:ahi] vs b[j+k:bhi] unknown
        if k:
            if alo < i and blo < j:
                self.__helper(alo, i, blo, j, answer)
            answer.append(x)
            if i+k < ahi and j+k < bhi:
                self.__helper(i+k, ahi, j+k, bhi, answer)

    def get_opcodes(self):
        """Return list of 5-tuples describing how to turn a into b.

        Each tuple is of the form (tag, i1, i2, j1, j2).  The first tuple
        has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
        tuple preceding it, and likewise for j1 == the previous j2.

        The tags are strings, with these meanings:

        'replace':  a[i1:i2] should be replaced by b[j1:j2]
        'delete':   a[i1:i2] should be deleted.
                    Note that j1==j2 in this case.
        'insert':   b[j1:j2] should be inserted at a[i1:i1].
                    Note that i1==i2 in this case.
        'equal':    a[i1:i2] == b[j1:j2]

        >>> a = "qabxcd"
        >>> b = "abycdf"
        >>> s = SequenceMatcher(None, a, b)
        >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
        ...    print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
        ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))
         delete a[0:1] (q) b[0:0] ()
          equal a[1:3] (ab) b[0:2] (ab)
        replace a[3:4] (x) b[2:3] (y)
          equal a[4:6] (cd) b[3:5] (cd)
         insert a[6:6] () b[5:6] (f)
        """

        if self.opcodes is not None:
            return self.opcodes
        i = j = 0
        self.opcodes = answer = []
        for ai, bj, size in self.get_matching_blocks():
            # invariant:  we've pumped out correct diffs to change
            # a[:i] into b[:j], and the next matching block is
            # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
            # out a diff to change a[i:ai] into b[j:bj], pump out
            # the matching block, and move (i,j) beyond the match
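            # (e.g. with the a = "qabxcd", b = "abycdf" doctest above, the
            # first matching block is (1, 0, 2), so the gap a[0:1] ("q")
            # vs b[0:0] ("") comes out as a 'delete' opcode)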
            tag = ''
            if i < ai and j < bj:
                tag = 'replace'
            elif i < ai:
                tag = 'delete'
            elif j < bj:
                tag = 'insert'
            if tag:
                answer.append( (tag, i, ai, j, bj) )
            i, j = ai+size, bj+size
            # the list of matching blocks is terminated by a
            # sentinel with size 0
            if size:
                answer.append( ('equal', ai, i, bj, j) )
        return answer

    def ratio(self):
        """Return a measure of the sequences' similarity (float in [0,1]).

        Where T is the total number of elements in both sequences, and
        M is the number of matches, this is 2.0*M / T.
        Note that this is 1 if the sequences are identical, and 0 if
        they have nothing in common.

        .ratio() is expensive to compute if you haven't already computed
        .get_matching_blocks() or .get_opcodes(), in which case you may
        want to try .quick_ratio() or .real_quick_ratio() first to get an
        upper bound.

        >>> s = SequenceMatcher(None, "abcd", "bcde")
        >>> s.ratio()
        0.75
        >>> s.quick_ratio()
        0.75
        >>> s.real_quick_ratio()
        1.0
        """

        matches = reduce(lambda sum, triple: sum + triple[-1],
                         self.get_matching_blocks(), 0)
        return 2.0 * matches / (len(self.a) + len(self.b))

    def quick_ratio(self):
        """Return an upper bound on ratio() relatively quickly.

        This isn't defined beyond that it is an upper bound on .ratio(), and
        is faster to compute.
        """

        # viewing a and b as multisets, set matches to the cardinality
        # of their intersection; this counts the number of matches
        # without regard to order, so is clearly an upper bound
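        # (for example, a = "abbb" and b = "babc" share the multiset
        # {a, b, b}, so quick_ratio() is 2*3/8 = 0.75 even though
        # ratio() is only 0.5)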
        if self.fullbcount is None:
            self.fullbcount = fullbcount = {}
            for elt in self.b:
                fullbcount[elt] = fullbcount.get(elt, 0) + 1
        fullbcount = self.fullbcount
        # avail[x] is the number of times x appears in 'b' less the
        # number of times we've seen it in 'a' so far ... kinda
        avail = {}
        availhas, matches = avail.has_key, 0
        for elt in self.a:
            if availhas(elt):
                numb = avail[elt]
            else:
                numb = fullbcount.get(elt, 0)
            avail[elt] = numb - 1
            if numb > 0:
                matches = matches + 1
        return 2.0 * matches / (len(self.a) + len(self.b))

    def real_quick_ratio(self):
        """Return an upper bound on ratio() very quickly.

        This isn't defined beyond that it is an upper bound on .ratio(), and
        is faster to compute than either .ratio() or .quick_ratio().
        """

        la, lb = len(self.a), len(self.b)
        # can't have more matches than the number of elements in the
        # shorter sequence
        return 2.0 * min(la, lb) / (la + lb)

def get_close_matches(word, possibilities, n=3, cutoff=0.6):
    """Use SequenceMatcher to return list of the best "good enough" matches.

    word is a sequence for which close matches are desired (typically a
    string).

    possibilities is a list of sequences against which to match word
    (typically a list of strings).

    Optional arg n (default 3) is the maximum number of close matches to
    return.  n must be > 0.

    Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
    that don't score at least that similar to word are ignored.

    The best (no more than n) matches among the possibilities are returned
    in a list, sorted by similarity score, most similar first.

    >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
    ['apple', 'ape']
    >>> import keyword
    >>> get_close_matches("wheel", keyword.kwlist)
    ['while']
    >>> get_close_matches("apple", keyword.kwlist)
    []
    >>> get_close_matches("accept", keyword.kwlist)
    ['except']
    """

    if not n > 0:
        raise ValueError("n must be > 0: " + `n`)
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: " + `cutoff`)
    result = []
    s = SequenceMatcher()
    s.set_seq2(word)
    for x in possibilities:
        s.set_seq1(x)
        if s.real_quick_ratio() >= cutoff and \
           s.quick_ratio() >= cutoff and \
           s.ratio() >= cutoff:
            result.append((s.ratio(), x))
    # Sort by score.
    result.sort()
    # Retain only the best n.
    result = result[-n:]
    # Move best-scorer to head of list.
    result.reverse()
    # Strip scores.
    return [x for score, x in result]

def _test():
    import doctest, difflib
    return doctest.testmod(difflib)

if __name__ == "__main__":
    _test()
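
# A minimal usage sketch (illustrative only, kept as a comment so the
# module itself is unchanged): score several candidate strings against one
# fixed target, keeping the target as seq2 so the cached statistics about
# the second sequence are reused, as the set_seq1()/set_seq2() docstrings
# above recommend.
#
#     s = SequenceMatcher(None)
#     s.set_seq2("abcd")
#     for candidate in ["abcd", "bcde", "xyz"]:
#         s.set_seq1(candidate)
#         print candidate, round(s.ratio(), 3)
#
# "abcd" scores 1.0, "bcde" scores 0.75 (matching the doctests above), and
# "xyz" scores 0.0.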