You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

MyersDiff.java 16KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546
  1. /*
  2. * Copyright (C) 2008-2009, Johannes E. Schindelin <johannes.schindelin@gmx.de>
  3. * Copyright (C) 2009, Johannes Schindelin <johannes.schindelin@gmx.de> and others
  4. *
  5. * This program and the accompanying materials are made available under the
  6. * terms of the Eclipse Distribution License v. 1.0 which is available at
  7. * https://www.eclipse.org/org/documents/edl-v10.php.
  8. *
  9. * SPDX-License-Identifier: BSD-3-Clause
  10. */
  11. package org.eclipse.jgit.diff;
  12. import java.text.MessageFormat;
  13. import org.eclipse.jgit.errors.DiffInterruptedException;
  14. import org.eclipse.jgit.internal.JGitText;
  15. import org.eclipse.jgit.util.IntList;
  16. import org.eclipse.jgit.util.LongList;
  17. /**
  18. * Diff algorithm, based on "An O(ND) Difference Algorithm and its Variations",
  19. * by Eugene Myers.
  20. * <p>
  21. * The basic idea is to put the line numbers of text A as columns ("x") and the
  22. * lines of text B as rows ("y"). Now you try to find the shortest "edit path"
  23. * from the upper left corner to the lower right corner, where you can always go
  24. * horizontally or vertically, but diagonally from (x,y) to (x+1,y+1) only if
  25. * line x in text A is identical to line y in text B.
  26. * <p>
  27. * Myers' fundamental concept is the "furthest reaching D-path on diagonal k": a
  28. * D-path is an edit path starting at the upper left corner and containing
  29. * exactly D non-diagonal elements ("differences"). The furthest reaching D-path
  30. * on diagonal k is the one that contains the most (diagonal) elements which
  31. * ends on diagonal k (where k = y - x).
  32. * <p>
  33. * Example:
  34. *
  35. * <pre>
  36. * H E L L O W O R L D
  37. * ____
  38. * L \___
  39. * O \___
  40. * W \________
  41. * </pre>
  42. * <p>
  43. * Since every D-path has exactly D horizontal or vertical elements, it can only
  44. * end on the diagonals -D, -D+2, ..., D-2, D.
  45. * <p>
  46. * Since every furthest reaching D-path contains at least one furthest reaching
  47. * (D-1)-path (except for D=0), we can construct them recursively.
  48. * <p>
  49. * Since we are really interested in the shortest edit path, we can start
  50. * looking for a 0-path, then a 1-path, and so on, until we find a path that
  51. * ends in the lower right corner.
  52. * <p>
  53. * To save space, we do not need to store all paths (which has quadratic space
  54. * requirements), but generate the D-paths simultaneously from both sides. When
  55. * the ends meet, we will have found "the middle" of the path. From the end
  56. * points of that diagonal part, we can generate the rest recursively.
  57. * <p>
  58. * This only requires linear space.
  59. * <p>
  60. * The overall (runtime) complexity is:
  61. *
  62. * <pre>
  63. * O(N * D^2 + 2 * N/2 * (D/2)^2 + 4 * N/4 * (D/4)^2 + ...)
  64. * = O(N * D^2 * 5 / 4) = O(N * D^2),
  65. * </pre>
  66. * <p>
  67. * (With each step, we have to find the middle parts of twice as many regions as
  68. * before, but the regions (as well as the D) are halved.)
  69. * <p>
  70. * So the overall runtime complexity stays the same with linear space, albeit
  71. * with a larger constant factor.
  72. *
  73. * @param <S>
  74. * type of sequence.
  75. */
  76. @SuppressWarnings("hiding")
  77. public class MyersDiff<S extends Sequence> {
  78. /** Singleton instance of MyersDiff. */
  79. public static final DiffAlgorithm INSTANCE = new LowLevelDiffAlgorithm() {
  80. @SuppressWarnings("unused")
  81. @Override
  82. public <S extends Sequence> void diffNonCommon(EditList edits,
  83. HashedSequenceComparator<S> cmp, HashedSequence<S> a,
  84. HashedSequence<S> b, Edit region) {
  85. new MyersDiff<>(edits, cmp, a, b, region);
  86. }
  87. };
  88. /**
  89. * The list of edits found during the last call to
  90. * {@link #calculateEdits(Edit)}
  91. */
  92. protected EditList edits;
  93. /** Comparison function for sequences. */
  94. protected HashedSequenceComparator<S> cmp;
  95. /**
  96. * The first text to be compared. Referred to as "Text A" in the comments
  97. */
  98. protected HashedSequence<S> a;
  99. /**
  100. * The second text to be compared. Referred to as "Text B" in the comments
  101. */
  102. protected HashedSequence<S> b;
  103. private MyersDiff(EditList edits, HashedSequenceComparator<S> cmp,
  104. HashedSequence<S> a, HashedSequence<S> b, Edit region) {
  105. this.edits = edits;
  106. this.cmp = cmp;
  107. this.a = a;
  108. this.b = b;
  109. calculateEdits(region);
  110. }
  111. // TODO: use ThreadLocal for future multi-threaded operations
  112. MiddleEdit middle = new MiddleEdit();
  113. /**
  114. * Entrypoint into the algorithm this class is all about. This method triggers that the
  115. * differences between A and B are calculated in form of a list of edits.
  116. * @param r portion of the sequences to examine.
  117. */
  118. private void calculateEdits(Edit r) {
  119. middle.initialize(r.beginA, r.endA, r.beginB, r.endB);
  120. if (middle.beginA >= middle.endA &&
  121. middle.beginB >= middle.endB)
  122. return;
  123. calculateEdits(middle.beginA, middle.endA,
  124. middle.beginB, middle.endB);
  125. }
  126. /**
  127. * Calculates the differences between a given part of A against another
  128. * given part of B
  129. *
  130. * @param beginA
  131. * start of the part of A which should be compared
  132. * (0&lt;=beginA&lt;sizeof(A))
  133. * @param endA
  134. * end of the part of A which should be compared
  135. * (beginA&lt;=endA&lt;sizeof(A))
  136. * @param beginB
  137. * start of the part of B which should be compared
  138. * (0&lt;=beginB&lt;sizeof(B))
  139. * @param endB
  140. * end of the part of B which should be compared
  141. * (beginB&lt;=endB&lt;sizeof(B))
  142. */
  143. protected void calculateEdits(int beginA, int endA,
  144. int beginB, int endB) {
  145. Edit edit = middle.calculate(beginA, endA, beginB, endB);
  146. if (beginA < edit.beginA || beginB < edit.beginB) {
  147. int k = edit.beginB - edit.beginA;
  148. int x = middle.backward.snake(k, edit.beginA);
  149. calculateEdits(beginA, x, beginB, k + x);
  150. }
  151. if (edit.getType() != Edit.Type.EMPTY)
  152. edits.add(edits.size(), edit);
  153. // after middle
  154. if (endA > edit.endA || endB > edit.endB) {
  155. int k = edit.endB - edit.endA;
  156. int x = middle.forward.snake(k, edit.endA);
  157. calculateEdits(x, endA, k + x, endB);
  158. }
  159. }
  160. /**
  161. * A class to help bisecting the sequences a and b to find minimal
  162. * edit paths.
  163. *
  164. * As the arrays are reused for space efficiency, you will need one
  165. * instance per thread.
  166. *
  167. * The entry function is the calculate() method.
  168. */
  169. class MiddleEdit {
  170. void initialize(int beginA, int endA, int beginB, int endB) {
  171. this.beginA = beginA; this.endA = endA;
  172. this.beginB = beginB; this.endB = endB;
  173. // strip common parts on either end
  174. int k = beginB - beginA;
  175. this.beginA = forward.snake(k, beginA);
  176. this.beginB = k + this.beginA;
  177. k = endB - endA;
  178. this.endA = backward.snake(k, endA);
  179. this.endB = k + this.endA;
  180. }
  181. /*
  182. * This function calculates the "middle" Edit of the shortest
  183. * edit path between the given subsequences of a and b.
  184. *
  185. * Once a forward path and a backward path meet, we found the
  186. * middle part. From the last snake end point on both of them,
  187. * we construct the Edit.
  188. *
  189. * It is assumed that there is at least one edit in the range.
  190. */
  191. // TODO: measure speed impact when this is synchronized
  192. Edit calculate(int beginA, int endA, int beginB, int endB) {
  193. if (beginA == endA || beginB == endB)
  194. return new Edit(beginA, endA, beginB, endB);
  195. this.beginA = beginA; this.endA = endA;
  196. this.beginB = beginB; this.endB = endB;
  197. /*
  198. * Following the conventions in Myers' paper, "k" is
  199. * the difference between the index into "b" and the
  200. * index into "a".
  201. */
  202. int minK = beginB - endA;
  203. int maxK = endB - beginA;
  204. forward.initialize(beginB - beginA, beginA, minK, maxK);
  205. backward.initialize(endB - endA, endA, minK, maxK);
  206. for (int d = 1; ; d++)
  207. if (forward.calculate(d) ||
  208. backward.calculate(d))
  209. return edit;
  210. }
  211. /*
  212. * For each d, we need to hold the d-paths for the diagonals
  213. * k = -d, -d + 2, ..., d - 2, d. These are stored in the
  214. * forward (and backward) array.
  215. *
  216. * As we allow subsequences, too, this needs some refinement:
  217. * the forward paths start on the diagonal forwardK =
  218. * beginB - beginA, and backward paths start on the diagonal
  219. * backwardK = endB - endA.
  220. *
  221. * So, we need to hold the forward d-paths for the diagonals
  222. * k = forwardK - d, forwardK - d + 2, ..., forwardK + d and
  223. * the analogue for the backward d-paths. This means that
  224. * we can turn (k, d) into the forward array index using this
  225. * formula:
  226. *
  227. * i = (d + k - forwardK) / 2
  228. *
  229. * There is a further complication: the edit paths should not
  230. * leave the specified subsequences, so k is bounded by
  231. * minK = beginB - endA and maxK = endB - beginA. However,
  232. * (k - forwardK) _must_ be odd whenever d is odd, and it
  233. * _must_ be even when d is even.
  234. *
  235. * The values in the "forward" and "backward" arrays are
  236. * positions ("x") in the sequence a, to get the corresponding
  237. * positions ("y") in the sequence b, you have to calculate
  238. * the appropriate k and then y:
  239. *
  240. * k = forwardK - d + i * 2
  241. * y = k + x
  242. *
  243. * (substitute backwardK for forwardK if you want to get the
  244. * y position for an entry in the "backward" array.
  245. */
  246. EditPaths forward = new ForwardEditPaths();
  247. EditPaths backward = new BackwardEditPaths();
  248. /* Some variables which are shared between methods */
  249. protected int beginA, endA, beginB, endB;
  250. protected Edit edit;
  251. abstract class EditPaths {
  252. private IntList x = new IntList();
  253. private LongList snake = new LongList();
  254. int beginK, endK, middleK;
  255. int prevBeginK, prevEndK;
  256. /* if we hit one end early, no need to look further */
  257. int minK, maxK; // TODO: better explanation
  258. final int getIndex(int d, int k) {
  259. // TODO: remove
  260. if (((d + k - middleK) % 2) != 0)
  261. throw new RuntimeException(MessageFormat.format(JGitText.get().unexpectedOddResult, Integer.valueOf(d), Integer.valueOf(k), Integer.valueOf(middleK)));
  262. return (d + k - middleK) / 2;
  263. }
  264. final int getX(int d, int k) {
  265. // TODO: remove
  266. if (k < beginK || k > endK)
  267. throw new RuntimeException(MessageFormat.format(JGitText.get().kNotInRange, Integer.valueOf(k), Integer.valueOf(beginK), Integer.valueOf(endK)));
  268. return x.get(getIndex(d, k));
  269. }
  270. final long getSnake(int d, int k) {
  271. // TODO: remove
  272. if (k < beginK || k > endK)
  273. throw new RuntimeException(MessageFormat.format(JGitText.get().kNotInRange, Integer.valueOf(k), Integer.valueOf(beginK), Integer.valueOf(endK)));
  274. return snake.get(getIndex(d, k));
  275. }
  276. private int forceKIntoRange(int k) {
  277. /* if k is odd, so must be the result */
  278. if (k < minK)
  279. return minK + ((k ^ minK) & 1);
  280. else if (k > maxK)
  281. return maxK - ((k ^ maxK) & 1);
  282. return k;
  283. }
  284. void initialize(int k, int x, int minK, int maxK) {
  285. this.minK = minK;
  286. this.maxK = maxK;
  287. beginK = endK = middleK = k;
  288. this.x.clear();
  289. this.x.add(x);
  290. snake.clear();
  291. snake.add(newSnake(k, x));
  292. }
  293. abstract int snake(int k, int x);
  294. abstract int getLeft(int x);
  295. abstract int getRight(int x);
  296. abstract boolean isBetter(int left, int right);
  297. abstract void adjustMinMaxK(int k, int x);
  298. abstract boolean meets(int d, int k, int x, long snake);
  299. final long newSnake(int k, int x) {
  300. long y = k + x;
  301. long ret = ((long) x) << 32;
  302. return ret | y;
  303. }
  304. final int snake2x(long snake) {
  305. return (int) (snake >>> 32);
  306. }
  307. final int snake2y(long snake) {
  308. return (int) snake;
  309. }
  310. final boolean makeEdit(long snake1, long snake2) {
  311. int x1 = snake2x(snake1), x2 = snake2x(snake2);
  312. int y1 = snake2y(snake1), y2 = snake2y(snake2);
  313. /*
  314. * Check for incompatible partial edit paths:
  315. * when there are ambiguities, we might have
  316. * hit incompatible (i.e. non-overlapping)
  317. * forward/backward paths.
  318. *
  319. * In that case, just pretend that we have
  320. * an empty edit at the end of one snake; this
  321. * will force a decision which path to take
  322. * in the next recursion step.
  323. */
  324. if (x1 > x2 || y1 > y2) {
  325. x1 = x2;
  326. y1 = y2;
  327. }
  328. edit = new Edit(x1, x2, y1, y2);
  329. return true;
  330. }
  331. boolean calculate(int d) {
  332. prevBeginK = beginK;
  333. prevEndK = endK;
  334. beginK = forceKIntoRange(middleK - d);
  335. endK = forceKIntoRange(middleK + d);
  336. // TODO: handle i more efficiently
  337. // TODO: walk snake(k, getX(d, k)) only once per (d, k)
  338. // TODO: move end points out of the loop to avoid conditionals inside the loop
  339. // go backwards so that we can avoid temp vars
  340. for (int k = endK; k >= beginK; k -= 2) {
  341. if (Thread.interrupted()) {
  342. throw new DiffInterruptedException();
  343. }
  344. int left = -1, right = -1;
  345. long leftSnake = -1L, rightSnake = -1L;
  346. // TODO: refactor into its own function
  347. if (k > prevBeginK) {
  348. int i = getIndex(d - 1, k - 1);
  349. left = x.get(i);
  350. int end = snake(k - 1, left);
  351. leftSnake = left != end ?
  352. newSnake(k - 1, end) :
  353. snake.get(i);
  354. if (meets(d, k - 1, end, leftSnake))
  355. return true;
  356. left = getLeft(end);
  357. }
  358. if (k < prevEndK) {
  359. int i = getIndex(d - 1, k + 1);
  360. right = x.get(i);
  361. int end = snake(k + 1, right);
  362. rightSnake = right != end ?
  363. newSnake(k + 1, end) :
  364. snake.get(i);
  365. if (meets(d, k + 1, end, rightSnake))
  366. return true;
  367. right = getRight(end);
  368. }
  369. int newX;
  370. long newSnake;
  371. if (k >= prevEndK ||
  372. (k > prevBeginK &&
  373. isBetter(left, right))) {
  374. newX = left;
  375. newSnake = leftSnake;
  376. }
  377. else {
  378. newX = right;
  379. newSnake = rightSnake;
  380. }
  381. if (meets(d, k, newX, newSnake))
  382. return true;
  383. adjustMinMaxK(k, newX);
  384. int i = getIndex(d, k);
  385. x.set(i, newX);
  386. snake.set(i, newSnake);
  387. }
  388. return false;
  389. }
  390. }
  391. class ForwardEditPaths extends EditPaths {
  392. @Override
  393. final int snake(int k, int x) {
  394. for (; x < endA && k + x < endB; x++)
  395. if (!cmp.equals(a, x, b, k + x))
  396. break;
  397. return x;
  398. }
  399. @Override
  400. final int getLeft(int x) {
  401. return x;
  402. }
  403. @Override
  404. final int getRight(int x) {
  405. return x + 1;
  406. }
  407. @Override
  408. final boolean isBetter(int left, int right) {
  409. return left > right;
  410. }
  411. @Override
  412. final void adjustMinMaxK(int k, int x) {
  413. if (x >= endA || k + x >= endB) {
  414. if (k > backward.middleK)
  415. maxK = k;
  416. else
  417. minK = k;
  418. }
  419. }
  420. @Override
  421. final boolean meets(int d, int k, int x, long snake) {
  422. if (k < backward.beginK || k > backward.endK)
  423. return false;
  424. // TODO: move out of loop
  425. if (((d - 1 + k - backward.middleK) % 2) != 0)
  426. return false;
  427. if (x < backward.getX(d - 1, k))
  428. return false;
  429. makeEdit(snake, backward.getSnake(d - 1, k));
  430. return true;
  431. }
  432. }
  433. class BackwardEditPaths extends EditPaths {
  434. @Override
  435. final int snake(int k, int x) {
  436. for (; x > beginA && k + x > beginB; x--)
  437. if (!cmp.equals(a, x - 1, b, k + x - 1))
  438. break;
  439. return x;
  440. }
  441. @Override
  442. final int getLeft(int x) {
  443. return x - 1;
  444. }
  445. @Override
  446. final int getRight(int x) {
  447. return x;
  448. }
  449. @Override
  450. final boolean isBetter(int left, int right) {
  451. return left < right;
  452. }
  453. @Override
  454. final void adjustMinMaxK(int k, int x) {
  455. if (x <= beginA || k + x <= beginB) {
  456. if (k > forward.middleK)
  457. maxK = k;
  458. else
  459. minK = k;
  460. }
  461. }
  462. @Override
  463. final boolean meets(int d, int k, int x, long snake) {
  464. if (k < forward.beginK || k > forward.endK)
  465. return false;
  466. // TODO: move out of loop
  467. if (((d + k - forward.middleK) % 2) != 0)
  468. return false;
  469. if (x > forward.getX(d, k))
  470. return false;
  471. makeEdit(forward.getSnake(d, k), snake);
  472. return true;
  473. }
  474. }
  475. }
  476. /**
  477. * Main method
  478. *
  479. * @param args
  480. * two filenames specifying the contents to be diffed
  481. */
  482. public static void main(String[] args) {
  483. if (args.length != 2) {
  484. System.err.println(JGitText.get().need2Arguments);
  485. System.exit(1);
  486. }
  487. try {
  488. RawText a = new RawText(new java.io.File(args[0]));
  489. RawText b = new RawText(new java.io.File(args[1]));
  490. EditList r = INSTANCE.diff(RawTextComparator.DEFAULT, a, b);
  491. System.out.println(r.toString());
  492. } catch (Exception e) {
  493. e.printStackTrace();
  494. }
  495. }
  496. }