You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

MyersDiff.java 17KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544
  1. /*
  2. * Copyright (C) 2008-2009, Johannes E. Schindelin <johannes.schindelin@gmx.de>
  3. * Copyright (C) 2009, Johannes Schindelin <johannes.schindelin@gmx.de>
  4. * and other copyright owners as documented in the project's IP log.
  5. *
  6. * This program and the accompanying materials are made available
  7. * under the terms of the Eclipse Distribution License v1.0 which
  8. * accompanies this distribution, is reproduced below, and is
  9. * available at http://www.eclipse.org/org/documents/edl-v10.php
  10. *
  11. * All rights reserved.
  12. *
  13. * Redistribution and use in source and binary forms, with or
  14. * without modification, are permitted provided that the following
  15. * conditions are met:
  16. *
  17. * - Redistributions of source code must retain the above copyright
  18. * notice, this list of conditions and the following disclaimer.
  19. *
  20. * - Redistributions in binary form must reproduce the above
  21. * copyright notice, this list of conditions and the following
  22. * disclaimer in the documentation and/or other materials provided
  23. * with the distribution.
  24. *
  25. * - Neither the name of the Eclipse Foundation, Inc. nor the
  26. * names of its contributors may be used to endorse or promote
  27. * products derived from this software without specific prior
  28. * written permission.
  29. *
  30. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
  31. * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
  32. * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  33. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  34. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  35. * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  36. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  37. * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  38. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  39. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  40. * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  41. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  42. * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  43. */
  44. package org.eclipse.jgit.diff;
  45. import java.text.MessageFormat;
  46. import org.eclipse.jgit.JGitText;
  47. import org.eclipse.jgit.util.IntList;
  48. import org.eclipse.jgit.util.LongList;
  49. /**
  50. * Diff algorithm, based on "An O(ND) Difference Algorithm and its
  51. * Variations", by Eugene Myers.
  52. *
  53. * The basic idea is to put the line numbers of text A as columns ("x") and the
  54. * lines of text B as rows ("y"). Now you try to find the shortest "edit path"
  55. * from the upper left corner to the lower right corner, where you can
  56. * always go horizontally or vertically, but diagonally from (x,y) to
  57. * (x+1,y+1) only if line x in text A is identical to line y in text B.
  58. *
  59. * Myers' fundamental concept is the "furthest reaching D-path on diagonal k":
  60. * a D-path is an edit path starting at the upper left corner and containing
  61. * exactly D non-diagonal elements ("differences"). The furthest reaching
  62. * D-path on diagonal k is the one that contains the most (diagonal) elements
  63. * which ends on diagonal k (where k = y - x).
  64. *
  65. * Example:
  66. *
  67. * H E L L O W O R L D
  68. * ____
  69. * L \___
  70. * O \___
  71. * W \________
  72. *
  73. * Since every D-path has exactly D horizontal or vertical elements, it can
  74. * only end on the diagonals -D, -D+2, ..., D-2, D.
  75. *
  76. * Since every furthest reaching D-path contains at least one furthest
  77. * reaching (D-1)-path (except for D=0), we can construct them recursively.
  78. *
  79. * Since we are really interested in the shortest edit path, we can start
  80. * looking for a 0-path, then a 1-path, and so on, until we find a path that
  81. * ends in the lower right corner.
  82. *
  83. * To save space, we do not need to store all paths (which has quadratic space
  84. * requirements), but generate the D-paths simultaneously from both sides.
  85. * When the ends meet, we will have found "the middle" of the path. From the
  86. * end points of that diagonal part, we can generate the rest recursively.
  87. *
  88. * This only requires linear space.
  89. *
  90. * The overall (runtime) complexity is
  91. *
  92. * O(N * D^2 + 2 * N/2 * (D/2)^2 + 4 * N/4 * (D/4)^2 + ...)
  93. * = O(N * D^2 * 5 / 4) = O(N * D^2),
  94. *
  95. * (With each step, we have to find the middle parts of twice as many regions
  96. * as before, but the regions (as well as the D) are halved.)
  97. *
  98. * So the overall runtime complexity stays the same with linear space,
  99. * albeit with a larger constant factor.
  100. *
  101. * @param <S>
  102. * type of sequence.
  103. */
  104. public class MyersDiff<S extends Sequence> {
  105. /** Singleton instance of MyersDiff. */
  106. public static final DiffAlgorithm INSTANCE = new LowLevelDiffAlgorithm() {
  107. @Override
  108. public <S extends Sequence> void diffNonCommon(EditList edits,
  109. HashedSequenceComparator<S> cmp, HashedSequence<S> a,
  110. HashedSequence<S> b, Edit region) {
  111. new MyersDiff<S>(edits, cmp, a, b, region);
  112. }
  113. };
  114. /**
  115. * The list of edits found during the last call to
  116. * {@link #calculateEdits(Edit)}
  117. */
  118. protected EditList edits;
  119. /** Comparison function for sequences. */
  120. protected HashedSequenceComparator<S> cmp;
  121. /**
  122. * The first text to be compared. Referred to as "Text A" in the comments
  123. */
  124. protected HashedSequence<S> a;
  125. /**
  126. * The second text to be compared. Referred to as "Text B" in the comments
  127. */
  128. protected HashedSequence<S> b;
  129. private MyersDiff(EditList edits, HashedSequenceComparator<S> cmp,
  130. HashedSequence<S> a, HashedSequence<S> b, Edit region) {
  131. this.edits = edits;
  132. this.cmp = cmp;
  133. this.a = a;
  134. this.b = b;
  135. calculateEdits(region);
  136. }
  137. // TODO: use ThreadLocal for future multi-threaded operations
  138. MiddleEdit middle = new MiddleEdit();
  139. /**
  140. * Entrypoint into the algorithm this class is all about. This method triggers that the
  141. * differences between A and B are calculated in form of a list of edits.
  142. * @param r portion of the sequences to examine.
  143. */
  144. private void calculateEdits(Edit r) {
  145. middle.initialize(r.beginA, r.endA, r.beginB, r.endB);
  146. if (middle.beginA >= middle.endA &&
  147. middle.beginB >= middle.endB)
  148. return;
  149. calculateEdits(middle.beginA, middle.endA,
  150. middle.beginB, middle.endB);
  151. }
  152. /**
  153. * Calculates the differences between a given part of A against another given part of B
  154. * @param beginA start of the part of A which should be compared (0<=beginA<sizeof(A))
  155. * @param endA end of the part of A which should be compared (beginA<=endA<sizeof(A))
  156. * @param beginB start of the part of B which should be compared (0<=beginB<sizeof(B))
  157. * @param endB end of the part of B which should be compared (beginB<=endB<sizeof(B))
  158. */
  159. protected void calculateEdits(int beginA, int endA,
  160. int beginB, int endB) {
  161. Edit edit = middle.calculate(beginA, endA, beginB, endB);
  162. if (beginA < edit.beginA || beginB < edit.beginB) {
  163. int k = edit.beginB - edit.beginA;
  164. int x = middle.backward.snake(k, edit.beginA);
  165. calculateEdits(beginA, x, beginB, k + x);
  166. }
  167. if (edit.getType() != Edit.Type.EMPTY)
  168. edits.add(edits.size(), edit);
  169. // after middle
  170. if (endA > edit.endA || endB > edit.endB) {
  171. int k = edit.endB - edit.endA;
  172. int x = middle.forward.snake(k, edit.endA);
  173. calculateEdits(x, endA, k + x, endB);
  174. }
  175. }
  176. /**
  177. * A class to help bisecting the sequences a and b to find minimal
  178. * edit paths.
  179. *
  180. * As the arrays are reused for space efficiency, you will need one
  181. * instance per thread.
  182. *
  183. * The entry function is the calculate() method.
  184. */
  185. class MiddleEdit {
  186. void initialize(int beginA, int endA, int beginB, int endB) {
  187. this.beginA = beginA; this.endA = endA;
  188. this.beginB = beginB; this.endB = endB;
  189. // strip common parts on either end
  190. int k = beginB - beginA;
  191. this.beginA = forward.snake(k, beginA);
  192. this.beginB = k + this.beginA;
  193. k = endB - endA;
  194. this.endA = backward.snake(k, endA);
  195. this.endB = k + this.endA;
  196. }
  197. /*
  198. * This function calculates the "middle" Edit of the shortest
  199. * edit path between the given subsequences of a and b.
  200. *
  201. * Once a forward path and a backward path meet, we found the
  202. * middle part. From the last snake end point on both of them,
  203. * we construct the Edit.
  204. *
  205. * It is assumed that there is at least one edit in the range.
  206. */
  207. // TODO: measure speed impact when this is synchronized
  208. Edit calculate(int beginA, int endA, int beginB, int endB) {
  209. if (beginA == endA || beginB == endB)
  210. return new Edit(beginA, endA, beginB, endB);
  211. this.beginA = beginA; this.endA = endA;
  212. this.beginB = beginB; this.endB = endB;
  213. /*
  214. * Following the conventions in Myers' paper, "k" is
  215. * the difference between the index into "b" and the
  216. * index into "a".
  217. */
  218. int minK = beginB - endA;
  219. int maxK = endB - beginA;
  220. forward.initialize(beginB - beginA, beginA, minK, maxK);
  221. backward.initialize(endB - endA, endA, minK, maxK);
  222. for (int d = 1; ; d++)
  223. if (forward.calculate(d) ||
  224. backward.calculate(d))
  225. return edit;
  226. }
  227. /*
  228. * For each d, we need to hold the d-paths for the diagonals
  229. * k = -d, -d + 2, ..., d - 2, d. These are stored in the
  230. * forward (and backward) array.
  231. *
  232. * As we allow subsequences, too, this needs some refinement:
  233. * the forward paths start on the diagonal forwardK =
  234. * beginB - beginA, and backward paths start on the diagonal
  235. * backwardK = endB - endA.
  236. *
  237. * So, we need to hold the forward d-paths for the diagonals
  238. * k = forwardK - d, forwardK - d + 2, ..., forwardK + d and
  239. * the analogue for the backward d-paths. This means that
  240. * we can turn (k, d) into the forward array index using this
  241. * formula:
  242. *
  243. * i = (d + k - forwardK) / 2
  244. *
  245. * There is a further complication: the edit paths should not
  246. * leave the specified subsequences, so k is bounded by
  247. * minK = beginB - endA and maxK = endB - beginA. However,
  248. * (k - forwardK) _must_ be odd whenever d is odd, and it
  249. * _must_ be even when d is even.
  250. *
  251. * The values in the "forward" and "backward" arrays are
  252. * positions ("x") in the sequence a, to get the corresponding
  253. * positions ("y") in the sequence b, you have to calculate
  254. * the appropriate k and then y:
  255. *
  256. * k = forwardK - d + i * 2
  257. * y = k + x
  258. *
  259. * (substitute backwardK for forwardK if you want to get the
  260. * y position for an entry in the "backward" array.
  261. */
  262. EditPaths forward = new ForwardEditPaths();
  263. EditPaths backward = new BackwardEditPaths();
  264. /* Some variables which are shared between methods */
  265. protected int beginA, endA, beginB, endB;
  266. protected Edit edit;
  267. abstract class EditPaths {
  268. private IntList x = new IntList();
  269. private LongList snake = new LongList();
  270. int beginK, endK, middleK;
  271. int prevBeginK, prevEndK;
  272. /* if we hit one end early, no need to look further */
  273. int minK, maxK; // TODO: better explanation
  274. final int getIndex(int d, int k) {
  275. // TODO: remove
  276. if (((d + k - middleK) % 2) != 0)
  277. throw new RuntimeException(MessageFormat.format(JGitText.get().unexpectedOddResult, d, k, middleK));
  278. return (d + k - middleK) / 2;
  279. }
  280. final int getX(int d, int k) {
  281. // TODO: remove
  282. if (k < beginK || k > endK)
  283. throw new RuntimeException(MessageFormat.format(JGitText.get().kNotInRange, k, beginK, endK));
  284. return x.get(getIndex(d, k));
  285. }
  286. final long getSnake(int d, int k) {
  287. // TODO: remove
  288. if (k < beginK || k > endK)
  289. throw new RuntimeException(MessageFormat.format(JGitText.get().kNotInRange, k, beginK, endK));
  290. return snake.get(getIndex(d, k));
  291. }
  292. private int forceKIntoRange(int k) {
  293. /* if k is odd, so must be the result */
  294. if (k < minK)
  295. return minK + ((k ^ minK) & 1);
  296. else if (k > maxK)
  297. return maxK - ((k ^ maxK) & 1);
  298. return k;
  299. }
  300. void initialize(int k, int x, int minK, int maxK) {
  301. this.minK = minK;
  302. this.maxK = maxK;
  303. beginK = endK = middleK = k;
  304. this.x.clear();
  305. this.x.add(x);
  306. snake.clear();
  307. snake.add(newSnake(k, x));
  308. }
  309. abstract int snake(int k, int x);
  310. abstract int getLeft(int x);
  311. abstract int getRight(int x);
  312. abstract boolean isBetter(int left, int right);
  313. abstract void adjustMinMaxK(final int k, final int x);
  314. abstract boolean meets(int d, int k, int x, long snake);
  315. final long newSnake(int k, int x) {
  316. long y = k + x;
  317. long ret = ((long) x) << 32;
  318. return ret | y;
  319. }
  320. final int snake2x(long snake) {
  321. return (int) (snake >>> 32);
  322. }
  323. final int snake2y(long snake) {
  324. return (int) snake;
  325. }
  326. final boolean makeEdit(long snake1, long snake2) {
  327. int x1 = snake2x(snake1), x2 = snake2x(snake2);
  328. int y1 = snake2y(snake1), y2 = snake2y(snake2);
  329. /*
  330. * Check for incompatible partial edit paths:
  331. * when there are ambiguities, we might have
  332. * hit incompatible (i.e. non-overlapping)
  333. * forward/backward paths.
  334. *
  335. * In that case, just pretend that we have
  336. * an empty edit at the end of one snake; this
  337. * will force a decision which path to take
  338. * in the next recursion step.
  339. */
  340. if (x1 > x2 || y1 > y2) {
  341. x1 = x2;
  342. y1 = y2;
  343. }
  344. edit = new Edit(x1, x2, y1, y2);
  345. return true;
  346. }
  347. boolean calculate(int d) {
  348. prevBeginK = beginK;
  349. prevEndK = endK;
  350. beginK = forceKIntoRange(middleK - d);
  351. endK = forceKIntoRange(middleK + d);
  352. // TODO: handle i more efficiently
  353. // TODO: walk snake(k, getX(d, k)) only once per (d, k)
  354. // TODO: move end points out of the loop to avoid conditionals inside the loop
  355. // go backwards so that we can avoid temp vars
  356. for (int k = endK; k >= beginK; k -= 2) {
  357. int left = -1, right = -1;
  358. long leftSnake = -1L, rightSnake = -1L;
  359. // TODO: refactor into its own function
  360. if (k > prevBeginK) {
  361. int i = getIndex(d - 1, k - 1);
  362. left = x.get(i);
  363. int end = snake(k - 1, left);
  364. leftSnake = left != end ?
  365. newSnake(k - 1, end) :
  366. snake.get(i);
  367. if (meets(d, k - 1, end, leftSnake))
  368. return true;
  369. left = getLeft(end);
  370. }
  371. if (k < prevEndK) {
  372. int i = getIndex(d - 1, k + 1);
  373. right = x.get(i);
  374. int end = snake(k + 1, right);
  375. rightSnake = right != end ?
  376. newSnake(k + 1, end) :
  377. snake.get(i);
  378. if (meets(d, k + 1, end, rightSnake))
  379. return true;
  380. right = getRight(end);
  381. }
  382. int newX;
  383. long newSnake;
  384. if (k >= prevEndK ||
  385. (k > prevBeginK &&
  386. isBetter(left, right))) {
  387. newX = left;
  388. newSnake = leftSnake;
  389. }
  390. else {
  391. newX = right;
  392. newSnake = rightSnake;
  393. }
  394. if (meets(d, k, newX, newSnake))
  395. return true;
  396. adjustMinMaxK(k, newX);
  397. int i = getIndex(d, k);
  398. x.set(i, newX);
  399. snake.set(i, newSnake);
  400. }
  401. return false;
  402. }
  403. }
  404. class ForwardEditPaths extends EditPaths {
  405. final int snake(int k, int x) {
  406. for (; x < endA && k + x < endB; x++)
  407. if (!cmp.equals(a, x, b, k + x))
  408. break;
  409. return x;
  410. }
  411. final int getLeft(final int x) {
  412. return x;
  413. }
  414. final int getRight(final int x) {
  415. return x + 1;
  416. }
  417. final boolean isBetter(final int left, final int right) {
  418. return left > right;
  419. }
  420. final void adjustMinMaxK(final int k, final int x) {
  421. if (x >= endA || k + x >= endB) {
  422. if (k > backward.middleK)
  423. maxK = k;
  424. else
  425. minK = k;
  426. }
  427. }
  428. final boolean meets(int d, int k, int x, long snake) {
  429. if (k < backward.beginK || k > backward.endK)
  430. return false;
  431. // TODO: move out of loop
  432. if (((d - 1 + k - backward.middleK) % 2) != 0)
  433. return false;
  434. if (x < backward.getX(d - 1, k))
  435. return false;
  436. makeEdit(snake, backward.getSnake(d - 1, k));
  437. return true;
  438. }
  439. }
  440. class BackwardEditPaths extends EditPaths {
  441. final int snake(int k, int x) {
  442. for (; x > beginA && k + x > beginB; x--)
  443. if (!cmp.equals(a, x - 1, b, k + x - 1))
  444. break;
  445. return x;
  446. }
  447. final int getLeft(final int x) {
  448. return x - 1;
  449. }
  450. final int getRight(final int x) {
  451. return x;
  452. }
  453. final boolean isBetter(final int left, final int right) {
  454. return left < right;
  455. }
  456. final void adjustMinMaxK(final int k, final int x) {
  457. if (x <= beginA || k + x <= beginB) {
  458. if (k > forward.middleK)
  459. maxK = k;
  460. else
  461. minK = k;
  462. }
  463. }
  464. final boolean meets(int d, int k, int x, long snake) {
  465. if (k < forward.beginK || k > forward.endK)
  466. return false;
  467. // TODO: move out of loop
  468. if (((d + k - forward.middleK) % 2) != 0)
  469. return false;
  470. if (x > forward.getX(d, k))
  471. return false;
  472. makeEdit(forward.getSnake(d, k), snake);
  473. return true;
  474. }
  475. }
  476. }
  477. /**
  478. * @param args two filenames specifying the contents to be diffed
  479. */
  480. public static void main(String[] args) {
  481. if (args.length != 2) {
  482. System.err.println(JGitText.get().need2Arguments);
  483. System.exit(1);
  484. }
  485. try {
  486. RawText a = new RawText(new java.io.File(args[0]));
  487. RawText b = new RawText(new java.io.File(args[1]));
  488. EditList r = INSTANCE.diff(RawTextComparator.DEFAULT, a, b);
  489. System.out.println(r.toString());
  490. } catch (Exception e) {
  491. e.printStackTrace();
  492. }
  493. }
  494. }