123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535 |
- /*
- * Copyright (C) 2008-2009, Johannes E. Schindelin <johannes.schindelin@gmx.de>
- * Copyright (C) 2009, Johannes Schindelin <johannes.schindelin@gmx.de>
- * and other copyright owners as documented in the project's IP log.
- *
- * This program and the accompanying materials are made available
- * under the terms of the Eclipse Distribution License v1.0 which
- * accompanies this distribution, is reproduced below, and is
- * available at http://www.eclipse.org/org/documents/edl-v10.php
- *
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials provided
- * with the distribution.
- *
- * - Neither the name of the Eclipse Foundation, Inc. nor the
- * names of its contributors may be used to endorse or promote
- * products derived from this software without specific prior
- * written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
- * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
- * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
- package org.eclipse.jgit.diff;
-
- import org.eclipse.jgit.util.IntList;
- import org.eclipse.jgit.util.LongList;
-
- /**
- * Diff algorithm, based on "An O(ND) Difference Algorithm and its
- * Variations", by Eugene Myers.
- *
- * The basic idea is to put the line numbers of text A as columns ("x") and the
- * lines of text B as rows ("y"). Now you try to find the shortest "edit path"
- * from the upper left corner to the lower right corner, where you can
- * always go horizontally or vertically, but diagonally from (x,y) to
- * (x+1,y+1) only if line x in text A is identical to line y in text B.
- *
- * Myers' fundamental concept is the "furthest reaching D-path on diagonal k":
- * a D-path is an edit path starting at the upper left corner and containing
- * exactly D non-diagonal elements ("differences"). The furthest reaching
- * D-path on diagonal k is the one that contains the most (diagonal) elements
- * which ends on diagonal k (where k = y - x).
- *
- * Example:
- *
- * H E L L O W O R L D
- * ____
- * L \___
- * O \___
- * W \________
- *
- * Since every D-path has exactly D horizontal or vertical elements, it can
- * only end on the diagonals -D, -D+2, ..., D-2, D.
- *
- * Since every furthest reaching D-path contains at least one furthest
- * reaching (D-1)-path (except for D=0), we can construct them recursively.
- *
- * Since we are really interested in the shortest edit path, we can start
- * looking for a 0-path, then a 1-path, and so on, until we find a path that
- * ends in the lower right corner.
- *
- * To save space, we do not need to store all paths (which has quadratic space
- * requirements), but generate the D-paths simultaneously from both sides.
- * When the ends meet, we will have found "the middle" of the path. From the
- * end points of that diagonal part, we can generate the rest recursively.
- *
- * This only requires linear space.
- *
- * The overall (runtime) complexity is
- *
- * O(N * D^2 + 2 * N/2 * (D/2)^2 + 4 * N/4 * (D/4)^2 + ...)
- * = O(N * D^2 * 5 / 4) = O(N * D^2),
- *
- * (With each step, we have to find the middle parts of twice as many regions
- * as before, but the regions (as well as the D) are halved.)
- *
- * So the overall runtime complexity stays the same with linear space,
- * albeit with a larger constant factor.
- */
- public class MyersDiff {
- /**
- * The list of edits found during the last call to {@link #calculateEdits()}
- */
- protected EditList edits;
-
- /**
- * The first text to be compared. Referred to as "Text A" in the comments
- */
- protected Sequence a;
-
- /**
- * The second text to be compared. Referred to as "Text B" in the comments
- */
- protected Sequence b;
-
- /**
- * The only constructor
- *
- * @param a the text A which should be compared
- * @param b the text B which should be compared
- */
- public MyersDiff(Sequence a, Sequence b) {
- this.a = a;
- this.b = b;
- calculateEdits();
- }
-
- /**
- * @return the list of edits found during the last call to {@link #calculateEdits()}
- */
- public EditList getEdits() {
- return edits;
- }
-
- // TODO: use ThreadLocal for future multi-threaded operations
- MiddleEdit middle = new MiddleEdit();
-
- /**
- * Entrypoint into the algorithm this class is all about. This method triggers that the
- * differences between A and B are calculated in form of a list of edits.
- */
- protected void calculateEdits() {
- edits = new EditList();
-
- middle.initialize(0, a.size(), 0, b.size());
- if (middle.beginA >= middle.endA &&
- middle.beginB >= middle.endB)
- return;
-
- calculateEdits(middle.beginA, middle.endA,
- middle.beginB, middle.endB);
- }
-
- /**
- * Calculates the differences between a given part of A against another given part of B
- * @param beginA start of the part of A which should be compared (0<=beginA<sizeof(A))
- * @param endA end of the part of A which should be compared (beginA<=endA<sizeof(A))
- * @param beginB start of the part of B which should be compared (0<=beginB<sizeof(B))
- * @param endB end of the part of B which should be compared (beginB<=endB<sizeof(B))
- */
- protected void calculateEdits(int beginA, int endA,
- int beginB, int endB) {
- Edit edit = middle.calculate(beginA, endA, beginB, endB);
-
- if (beginA < edit.beginA || beginB < edit.beginB) {
- int k = edit.beginB - edit.beginA;
- int x = middle.backward.snake(k, edit.beginA);
- calculateEdits(beginA, x, beginB, k + x);
- }
-
- if (edit.getType() != Edit.Type.EMPTY)
- edits.add(edits.size(), edit);
-
- // after middle
- if (endA > edit.endA || endB > edit.endB) {
- int k = edit.endB - edit.endA;
- int x = middle.forward.snake(k, edit.endA);
- calculateEdits(x, endA, k + x, endB);
- }
- }
-
- /**
- * A class to help bisecting the sequences a and b to find minimal
- * edit paths.
- *
- * As the arrays are reused for space efficiency, you will need one
- * instance per thread.
- *
- * The entry function is the calculate() method.
- */
- class MiddleEdit {
- void initialize(int beginA, int endA, int beginB, int endB) {
- this.beginA = beginA; this.endA = endA;
- this.beginB = beginB; this.endB = endB;
-
- // strip common parts on either end
- int k = beginB - beginA;
- this.beginA = forward.snake(k, beginA);
- this.beginB = k + this.beginA;
-
- k = endB - endA;
- this.endA = backward.snake(k, endA);
- this.endB = k + this.endA;
- }
-
- /*
- * This function calculates the "middle" Edit of the shortest
- * edit path between the given subsequences of a and b.
- *
- * Once a forward path and a backward path meet, we found the
- * middle part. From the last snake end point on both of them,
- * we construct the Edit.
- *
- * It is assumed that there is at least one edit in the range.
- */
- // TODO: measure speed impact when this is synchronized
- Edit calculate(int beginA, int endA, int beginB, int endB) {
- if (beginA == endA || beginB == endB)
- return new Edit(beginA, endA, beginB, endB);
- this.beginA = beginA; this.endA = endA;
- this.beginB = beginB; this.endB = endB;
-
- /*
- * Following the conventions in Myers' paper, "k" is
- * the difference between the index into "b" and the
- * index into "a".
- */
- int minK = beginB - endA;
- int maxK = endB - beginA;
-
- forward.initialize(beginB - beginA, beginA, minK, maxK);
- backward.initialize(endB - endA, endA, minK, maxK);
-
- for (int d = 1; ; d++)
- if (forward.calculate(d) ||
- backward.calculate(d))
- return edit;
- }
-
- /*
- * For each d, we need to hold the d-paths for the diagonals
- * k = -d, -d + 2, ..., d - 2, d. These are stored in the
- * forward (and backward) array.
- *
- * As we allow subsequences, too, this needs some refinement:
- * the forward paths start on the diagonal forwardK =
- * beginB - beginA, and backward paths start on the diagonal
- * backwardK = endB - endA.
- *
- * So, we need to hold the forward d-paths for the diagonals
- * k = forwardK - d, forwardK - d + 2, ..., forwardK + d and
- * the analogue for the backward d-paths. This means that
- * we can turn (k, d) into the forward array index using this
- * formula:
- *
- * i = (d + k - forwardK) / 2
- *
- * There is a further complication: the edit paths should not
- * leave the specified subsequences, so k is bounded by
- * minK = beginB - endA and maxK = endB - beginA. However,
- * (k - forwardK) _must_ be odd whenever d is odd, and it
- * _must_ be even when d is even.
- *
- * The values in the "forward" and "backward" arrays are
- * positions ("x") in the sequence a, to get the corresponding
- * positions ("y") in the sequence b, you have to calculate
- * the appropriate k and then y:
- *
- * k = forwardK - d + i * 2
- * y = k + x
- *
- * (substitute backwardK for forwardK if you want to get the
- * y position for an entry in the "backward" array.
- */
- EditPaths forward = new ForwardEditPaths();
- EditPaths backward = new BackwardEditPaths();
-
- /* Some variables which are shared between methods */
- protected int beginA, endA, beginB, endB;
- protected Edit edit;
-
- abstract class EditPaths {
- private IntList x = new IntList();
- private LongList snake = new LongList();
- int beginK, endK, middleK;
- int prevBeginK, prevEndK;
- /* if we hit one end early, no need to look further */
- int minK, maxK; // TODO: better explanation
-
- final int getIndex(int d, int k) {
- // TODO: remove
- if (((d + k - middleK) % 2) == 1)
- throw new RuntimeException("odd: " + d + " + " + k + " - " + middleK);
- return (d + k - middleK) / 2;
- }
-
- final int getX(int d, int k) {
- // TODO: remove
- if (k < beginK || k > endK)
- throw new RuntimeException("k " + k + " not in " + beginK + " - " + endK);
- return x.get(getIndex(d, k));
- }
-
- final long getSnake(int d, int k) {
- // TODO: remove
- if (k < beginK || k > endK)
- throw new RuntimeException("k " + k + " not in " + beginK + " - " + endK);
- return snake.get(getIndex(d, k));
- }
-
- private int forceKIntoRange(int k) {
- /* if k is odd, so must be the result */
- if (k < minK)
- return minK + ((k ^ minK) & 1);
- else if (k > maxK)
- return maxK - ((k ^ maxK) & 1);
- return k;
- }
-
- void initialize(int k, int x, int minK, int maxK) {
- this.minK = minK;
- this.maxK = maxK;
- beginK = endK = middleK = k;
- this.x.clear();
- this.x.add(x);
- snake.clear();
- snake.add(newSnake(k, x));
- }
-
- abstract int snake(int k, int x);
- abstract int getLeft(int x);
- abstract int getRight(int x);
- abstract boolean isBetter(int left, int right);
- abstract void adjustMinMaxK(final int k, final int x);
- abstract boolean meets(int d, int k, int x, long snake);
-
- final long newSnake(int k, int x) {
- long y = k + x;
- long ret = ((long) x) << 32;
- return ret | y;
- }
-
- final int snake2x(long snake) {
- return (int) (snake >>> 32);
- }
-
- final int snake2y(long snake) {
- return (int) snake;
- }
-
- final boolean makeEdit(long snake1, long snake2) {
- int x1 = snake2x(snake1), x2 = snake2x(snake2);
- int y1 = snake2y(snake1), y2 = snake2y(snake2);
- /*
- * Check for incompatible partial edit paths:
- * when there are ambiguities, we might have
- * hit incompatible (i.e. non-overlapping)
- * forward/backward paths.
- *
- * In that case, just pretend that we have
- * an empty edit at the end of one snake; this
- * will force a decision which path to take
- * in the next recursion step.
- */
- if (x1 > x2 || y1 > y2) {
- x1 = x2;
- y1 = y2;
- }
- edit = new Edit(x1, x2, y1, y2);
- return true;
- }
-
- boolean calculate(int d) {
- prevBeginK = beginK;
- prevEndK = endK;
- beginK = forceKIntoRange(middleK - d);
- endK = forceKIntoRange(middleK + d);
- // TODO: handle i more efficiently
- // TODO: walk snake(k, getX(d, k)) only once per (d, k)
- // TODO: move end points out of the loop to avoid conditionals inside the loop
- // go backwards so that we can avoid temp vars
- for (int k = endK; k >= beginK; k -= 2) {
- int left = -1, right = -1;
- long leftSnake = -1L, rightSnake = -1L;
- // TODO: refactor into its own function
- if (k > prevBeginK) {
- int i = getIndex(d - 1, k - 1);
- left = x.get(i);
- int end = snake(k - 1, left);
- leftSnake = left != end ?
- newSnake(k - 1, end) :
- snake.get(i);
- if (meets(d, k - 1, end, leftSnake))
- return true;
- left = getLeft(end);
- }
- if (k < prevEndK) {
- int i = getIndex(d - 1, k + 1);
- right = x.get(i);
- int end = snake(k + 1, right);
- rightSnake = right != end ?
- newSnake(k + 1, end) :
- snake.get(i);
- if (meets(d, k + 1, end, rightSnake))
- return true;
- right = getRight(end);
- }
- int newX;
- long newSnake;
- if (k >= prevEndK ||
- (k > prevBeginK &&
- isBetter(left, right))) {
- newX = left;
- newSnake = leftSnake;
- }
- else {
- newX = right;
- newSnake = rightSnake;
- }
- if (meets(d, k, newX, newSnake))
- return true;
- adjustMinMaxK(k, newX);
- int i = getIndex(d, k);
- x.set(i, newX);
- snake.set(i, newSnake);
- }
- return false;
- }
- }
-
- class ForwardEditPaths extends EditPaths {
- final int snake(int k, int x) {
- for (; x < endA && k + x < endB; x++)
- if (!a.equals(x, b, k + x))
- break;
- return x;
- }
-
- final int getLeft(final int x) {
- return x;
- }
-
- final int getRight(final int x) {
- return x + 1;
- }
-
- final boolean isBetter(final int left, final int right) {
- return left > right;
- }
-
- final void adjustMinMaxK(final int k, final int x) {
- if (x >= endA || k + x >= endB) {
- if (k > backward.middleK)
- maxK = k;
- else
- minK = k;
- }
- }
-
- final boolean meets(int d, int k, int x, long snake) {
- if (k < backward.beginK || k > backward.endK)
- return false;
- // TODO: move out of loop
- if (((d - 1 + k - backward.middleK) % 2) == 1)
- return false;
- if (x < backward.getX(d - 1, k))
- return false;
- makeEdit(snake, backward.getSnake(d - 1, k));
- return true;
- }
- }
-
- class BackwardEditPaths extends EditPaths {
- final int snake(int k, int x) {
- for (; x > beginA && k + x > beginB; x--)
- if (!a.equals(x - 1, b, k + x - 1))
- break;
- return x;
- }
-
- final int getLeft(final int x) {
- return x - 1;
- }
-
- final int getRight(final int x) {
- return x;
- }
-
- final boolean isBetter(final int left, final int right) {
- return left < right;
- }
-
- final void adjustMinMaxK(final int k, final int x) {
- if (x <= beginA || k + x <= beginB) {
- if (k > forward.middleK)
- maxK = k;
- else
- minK = k;
- }
- }
-
- final boolean meets(int d, int k, int x, long snake) {
- if (k < forward.beginK || k > forward.endK)
- return false;
- // TODO: move out of loop
- if (((d + k - forward.middleK) % 2) == 1)
- return false;
- if (x > forward.getX(d, k))
- return false;
- makeEdit(forward.getSnake(d, k), snake);
- return true;
- }
- }
- }
-
- /**
- * @param args two filenames specifying the contents to be diffed
- */
- public static void main(String[] args) {
- if (args.length != 2) {
- System.err.println("Need 2 arguments");
- System.exit(1);
- }
- try {
- RawText a = new RawText(new java.io.File(args[0]));
- RawText b = new RawText(new java.io.File(args[1]));
- MyersDiff diff = new MyersDiff(a, b);
- System.out.println(diff.getEdits().toString());
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- }
|