]> source.dussan.org Git - jgit.git/commitdiff
Break dissimilar file pairs during diff 70/1170/7
authorJeff Schumacher <jeffschu@google.com>
Thu, 22 Jul 2010 19:55:28 +0000 (12:55 -0700)
committerShawn O. Pearce <spearce@spearce.org>
Wed, 28 Jul 2010 01:13:32 +0000 (18:13 -0700)
File pairs that are very dissimilar during a diff were not being
broken apart into their constituent ADD/DELETE pairs. The leads to
sub-optimal rename detection. Take, for example, this situation:

A file exists at src/a.txt containing "foo". A user renames src/a.txt
to src/b.txt, then adds a new src/a.txt containing "bar".

Even though the old a.txt and the new b.txt are identical, the
rename detection algorithm would not detect it as a rename since
it was already paired in a MODIFY. I added code to split all
MODIFYs below a certain score into their constituent ADD/DELETE
pairs. This allows situations like the one I described above to be
more correctly handled.

Change-Id: I22c04b70581f206bbc68c4cd1ee87a1f663b418e
Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java
org.eclipse.jgit/resources/org/eclipse/jgit/JGitText.properties
org.eclipse.jgit/src/org/eclipse/jgit/JGitText.java
org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffEntry.java
org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffFormatter.java
org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java

index bfc51b68f9382871744f3556456c6afdfba227cc..eb965cf601a28995f2eaf5528f069da1cd576c68 100644 (file)
@@ -383,6 +383,116 @@ public class RenameDetectorTest extends RepositoryTestCase {
                assertSame(b, entries.get(1));
        }
 
+       public void testBreakModify_BreakAll() throws Exception {
+               ObjectId aId = blob("foo");
+               ObjectId bId = blob("bar");
+
+               DiffEntry m = DiffEntry.modify(PATH_A);
+               m.oldId = AbbreviatedObjectId.fromObjectId(aId);
+               m.newId = AbbreviatedObjectId.fromObjectId(bId);
+
+               DiffEntry a = DiffEntry.add(PATH_B, aId);
+
+               rd.add(a);
+               rd.add(m);
+
+               rd.setBreakScore(101);
+
+               List<DiffEntry> entries = rd.compute();
+               assertEquals(2, entries.size());
+               assertAdd(PATH_A, bId, FileMode.REGULAR_FILE, entries.get(0));
+               assertRename(DiffEntry.breakModify(m).get(0), a, 100, entries.get(1));
+       }
+
+       public void testBreakModify_BreakNone() throws Exception {
+               ObjectId aId = blob("foo");
+               ObjectId bId = blob("bar");
+
+               DiffEntry m = DiffEntry.modify(PATH_A);
+               m.oldId = AbbreviatedObjectId.fromObjectId(aId);
+               m.newId = AbbreviatedObjectId.fromObjectId(bId);
+
+               DiffEntry a = DiffEntry.add(PATH_B, aId);
+
+               rd.add(a);
+               rd.add(m);
+
+               rd.setBreakScore(-1);
+
+               List<DiffEntry> entries = rd.compute();
+               assertEquals(2, entries.size());
+               assertSame(m, entries.get(0));
+               assertSame(a, entries.get(1));
+       }
+
+       public void testBreakModify_BreakBelowScore() throws Exception {
+               ObjectId aId = blob("foo");
+               ObjectId bId = blob("bar");
+
+               DiffEntry m = DiffEntry.modify(PATH_A);
+               m.oldId = AbbreviatedObjectId.fromObjectId(aId);
+               m.newId = AbbreviatedObjectId.fromObjectId(bId);
+
+               DiffEntry a = DiffEntry.add(PATH_B, aId);
+
+               rd.add(a);
+               rd.add(m);
+
+               rd.setBreakScore(20); // Should break the modify
+
+               List<DiffEntry> entries = rd.compute();
+               assertEquals(2, entries.size());
+               assertAdd(PATH_A, bId, FileMode.REGULAR_FILE, entries.get(0));
+               assertRename(DiffEntry.breakModify(m).get(0), a, 100, entries.get(1));
+       }
+
+       public void testBreakModify_DontBreakAboveScore() throws Exception {
+               ObjectId aId = blob("blah\nblah\nfoo");
+               ObjectId bId = blob("blah\nblah\nbar");
+
+               DiffEntry m = DiffEntry.modify(PATH_A);
+               m.oldId = AbbreviatedObjectId.fromObjectId(aId);
+               m.newId = AbbreviatedObjectId.fromObjectId(bId);
+
+               DiffEntry a = DiffEntry.add(PATH_B, aId);
+
+               rd.add(a);
+               rd.add(m);
+
+               rd.setBreakScore(20); // Should not break the modify
+
+               List<DiffEntry> entries = rd.compute();
+               assertEquals(2, entries.size());
+               assertSame(m, entries.get(0));
+               assertSame(a, entries.get(1));
+       }
+
+       public void testBreakModify_RejoinIfUnpaired() throws Exception {
+               ObjectId aId = blob("foo");
+               ObjectId bId = blob("bar");
+
+               DiffEntry m = DiffEntry.modify(PATH_A);
+               m.oldId = AbbreviatedObjectId.fromObjectId(aId);
+               m.newId = AbbreviatedObjectId.fromObjectId(bId);
+
+               rd.add(m);
+
+               rd.setBreakScore(101); // Ensure m is broken apart
+
+               List<DiffEntry> entries = rd.compute();
+               assertEquals(1, entries.size());
+
+               DiffEntry modify = entries.get(0);
+               assertEquals(m.oldName, modify.oldName);
+               assertEquals(m.oldId, modify.oldId);
+               assertEquals(m.oldMode, modify.oldMode);
+               assertEquals(m.newName, modify.newName);
+               assertEquals(m.newId, modify.newId);
+               assertEquals(m.newMode, modify.newMode);
+               assertEquals(m.changeType, modify.changeType);
+               assertEquals(0, modify.score);
+       }
+
        public void testSetRenameScore_IllegalArgs() throws Exception {
                try {
                        rd.setRenameScore(-1);
@@ -462,4 +572,15 @@ public class RenameDetectorTest extends RepositoryTestCase {
 
                assertEquals(score, copy.getScore());
        }
+
+       private static void assertAdd(String newName, ObjectId newId,
+                       FileMode newMode, DiffEntry add) {
+               assertEquals(DiffEntry.DEV_NULL, add.oldName);
+               assertEquals(DiffEntry.A_ZERO, add.oldId);
+               assertEquals(FileMode.MISSING, add.oldMode);
+               assertEquals(ChangeType.ADD, add.changeType);
+               assertEquals(newName, add.newName);
+               assertEquals(AbbreviatedObjectId.fromObjectId(newId), add.newId);
+               assertEquals(newMode, add.newMode);
+       }
 }
index 88c6c8e3a4457c5f51cc36bf6c5fc54f2ef6fac9..1b2b81fce3ed7909ca811c989d49a5fd0fc77990 100644 (file)
@@ -302,8 +302,10 @@ remoteDoesNotSupportSmartHTTPPush=remote does not support smart HTTP push
 remoteHungUpUnexpectedly=remote hung up unexpectedly
 remoteNameCantBeNull=Remote name can't be null.
 renamesAlreadyFound=Renames have already been found.
+renamesBreakingModifies=Breaking apart modified file pairs
 renamesFindingByContent=Finding renames by content similarity
 renamesFindingExact=Finding exact renames
+renamesRejoiningModifies=Rejoining modified file pairs
 repositoryAlreadyExists=Repository already exists: {0}
 repositoryConfigFileInvalid=Repository config file {0} invalid {1}
 repositoryIsRequired=Repository is required.
index 9df9887af4aed2e68627d52ad088bfc93f9609fb..9d1e2cd808c391701cfbbb2291b9d2acce04361b 100644 (file)
@@ -361,8 +361,10 @@ public class JGitText extends TranslationBundle {
        /***/ public String remoteHungUpUnexpectedly;
        /***/ public String remoteNameCantBeNull;
        /***/ public String renamesAlreadyFound;
+       /***/ public String renamesBreakingModifies;
        /***/ public String renamesFindingByContent;
        /***/ public String renamesFindingExact;
+       /***/ public String renamesRejoiningModifies;
        /***/ public String repositoryAlreadyExists;
        /***/ public String repositoryConfigFileInvalid;
        /***/ public String repositoryIsRequired;
index 304e4cff39a4d95230753b75804800597a4afb2a..7dbcdaa6d9438297f50a1ca2a1e9f49239e95aaa 100644 (file)
@@ -57,7 +57,8 @@ import org.eclipse.jgit.treewalk.TreeWalk;
 
 /** A value class representing a change to a file */
 public class DiffEntry {
-       private static final AbbreviatedObjectId A_ZERO = AbbreviatedObjectId
+       /** Magical SHA1 used for file adds or deletes */
+       static final AbbreviatedObjectId A_ZERO = AbbreviatedObjectId
                        .fromObjectId(ObjectId.zeroId());
 
        /** Magical file name used for file adds or deletes. */
@@ -171,6 +172,15 @@ public class DiffEntry {
                return e;
        }
 
+       /**
+        * Breaks apart a DiffEntry into two entries, one DELETE and one ADD.
+        *
+        * @param entry
+        *            the DiffEntry to break apart.
+        * @return a list containing two entries. Calling {@link #getChangeType()}
+        *         on the first entry will return ChangeType.DELETE. Calling it on
+        *         the second entry will return ChangeType.ADD.
+        */
        static List<DiffEntry> breakModify(DiffEntry entry) {
                DiffEntry del = new DiffEntry();
                del.oldId = entry.getOldId();
index 835cf6f865415adddb18a0667ec897834a38afbe..4ee742ae166561db983b61268cddace3694eb29a 100644 (file)
@@ -290,6 +290,14 @@ public class DiffFormatter {
                                o.write('\n');
                        }
                        break;
+               case MODIFY:
+                       int score = ent.getScore();
+                       if (0 < score && score <= 100) {
+                               o.write(encodeASCII("dissimilarity index " + (100 - score)
+                                               + "%"));
+                               o.write('\n');
+                       }
+                       break;
                }
 
                switch (ent.getChangeType()) {
index a3203e349faf0554548d4643ac58878a3c6483d2..fedd7cda15c11a5b0292864ef90c7375c9a95a68 100644 (file)
@@ -55,6 +55,7 @@ import java.util.List;
 import org.eclipse.jgit.JGitText;
 import org.eclipse.jgit.diff.DiffEntry.ChangeType;
 import org.eclipse.jgit.lib.AbbreviatedObjectId;
+import org.eclipse.jgit.lib.Constants;
 import org.eclipse.jgit.lib.FileMode;
 import org.eclipse.jgit.lib.NullProgressMonitor;
 import org.eclipse.jgit.lib.ObjectReader;
@@ -99,7 +100,7 @@ public class RenameDetector {
                }
        };
 
-       private final List<DiffEntry> entries = new ArrayList<DiffEntry>();
+       private List<DiffEntry> entries = new ArrayList<DiffEntry>();
 
        private List<DiffEntry> deleted = new ArrayList<DiffEntry>();
 
@@ -112,6 +113,13 @@ public class RenameDetector {
        /** Similarity score required to pair an add/delete as a rename. */
        private int renameScore = 60;
 
+       /**
+        * Similarity score required to keep modified file pairs together. Any
+        * modified file pairs with a similarity score below this will be broken
+        * apart.
+        */
+       private int breakScore = -1;
+
        /** Limit in the number of files to consider for renames. */
        private int renameLimit;
 
@@ -159,6 +167,29 @@ public class RenameDetector {
                renameScore = score;
        }
 
+       /**
+        * @return the similarity score required to keep modified file pairs
+        *         together. Any modify pairs that score below this will be broken
+        *         apart into separate add/deletes. Values less than or equal to
+        *         zero indicate that no modifies will be broken apart. Values over
+        *         100 cause all modify pairs to be broken.
+        */
+       public int getBreakScore() {
+               return breakScore;
+       }
+
+       /**
+        * @param breakScore
+        *            the similarity score required to keep modified file pairs
+        *            together. Any modify pairs that score below this will be
+        *            broken apart into separate add/deletes. Values less than or
+        *            equal to zero indicate that no modifies will be broken apart.
+        *            Values over 100 cause all modify pairs to be broken.
+        */
+       public void setBreakScore(int breakScore) {
+               this.breakScore = breakScore;
+       }
+
        /** @return limit on number of paths to perform inexact rename detection. */
        public int getRenameLimit() {
                return renameLimit;
@@ -219,10 +250,13 @@ public class RenameDetector {
                                break;
 
                        case MODIFY:
-                               if (sameType(entry.getOldMode(), entry.getNewMode()))
+                               if (sameType(entry.getOldMode(), entry.getNewMode())) {
                                        entries.add(entry);
-                               else
-                                       entries.addAll(DiffEntry.breakModify(entry));
+                               } else {
+                                       List<DiffEntry> tmp = DiffEntry.breakModify(entry);
+                                       deleted.add(tmp.get(0));
+                                       added.add(tmp.get(1));
+                               }
                                break;
 
                        case COPY:
@@ -275,8 +309,15 @@ public class RenameDetector {
 
                        if (pm == null)
                                pm = NullProgressMonitor.INSTANCE;
-                       findExactRenames(pm);
-                       findContentRenames(pm);
+                       ObjectReader reader = repo.newObjectReader();
+                       try {
+                               breakModifies(reader, pm);
+                               findExactRenames(pm);
+                               findContentRenames(reader, pm);
+                               rejoinModifies(pm);
+                       } finally {
+                               reader.release();
+                       }
 
                        entries.addAll(added);
                        added = null;
@@ -289,25 +330,96 @@ public class RenameDetector {
                return Collections.unmodifiableList(entries);
        }
 
-       private void findContentRenames(ProgressMonitor pm) throws IOException {
+       private void breakModifies(ObjectReader reader, ProgressMonitor pm)
+                       throws IOException {
+               if (breakScore <= 0)
+                       return;
+
+               ArrayList<DiffEntry> newEntries = new ArrayList<DiffEntry>(entries.size());
+
+               pm.beginTask(JGitText.get().renamesBreakingModifies, entries.size());
+
+               for (int i = 0; i < entries.size(); i++) {
+                       DiffEntry e = entries.get(i);
+                       if (e.getChangeType() == ChangeType.MODIFY) {
+                               int score = calculateModifyScore(reader, e);
+                               if (score < breakScore) {
+                                       List<DiffEntry> tmp = DiffEntry.breakModify(e);
+                                       DiffEntry del = tmp.get(0);
+                                       del.score = score;
+                                       deleted.add(del);
+                                       added.add(tmp.get(1));
+                               } else {
+                                       newEntries.add(e);
+                               }
+                       } else {
+                               newEntries.add(e);
+                       }
+                       pm.update(1);
+               }
+
+               entries = newEntries;
+       }
+
+       private void rejoinModifies(ProgressMonitor pm) {
+               HashMap<String, DiffEntry> nameMap = new HashMap<String, DiffEntry>();
+               ArrayList<DiffEntry> newAdded = new ArrayList<DiffEntry>(added.size());
+
+               pm.beginTask(JGitText.get().renamesRejoiningModifies, added.size()
+                               + deleted.size());
+
+               for (DiffEntry src : deleted) {
+                       nameMap.put(src.oldName, src);
+                       pm.update(1);
+               }
+
+               for (DiffEntry dst : added) {
+                       DiffEntry src = nameMap.remove(dst.newName);
+                       if (src != null) {
+                               if (sameType(src.oldMode, dst.newMode)) {
+                                       entries.add(DiffEntry.pair(ChangeType.MODIFY, src, dst,
+                                                       src.score));
+                               } else {
+                                       nameMap.put(src.oldName, src);
+                                       newAdded.add(dst);
+                               }
+                       } else {
+                               newAdded.add(dst);
+                       }
+                       pm.update(1);
+               }
+
+               added = newAdded;
+               deleted = new ArrayList<DiffEntry>(nameMap.values());
+       }
+
+       private int calculateModifyScore(ObjectReader reader, DiffEntry d)
+                       throws IOException {
+               SimilarityIndex src = new SimilarityIndex();
+               src.hash(reader.open(d.oldId.toObjectId(), Constants.OBJ_BLOB));
+               src.sort();
+
+               SimilarityIndex dst = new SimilarityIndex();
+               dst.hash(reader.open(d.newId.toObjectId(), Constants.OBJ_BLOB));
+               dst.sort();
+               return src.score(dst, 100);
+       }
+
+       private void findContentRenames(ObjectReader reader, ProgressMonitor pm)
+                       throws IOException {
                int cnt = Math.max(added.size(), deleted.size());
                if (cnt == 0)
                        return;
 
                if (getRenameLimit() == 0 || cnt <= getRenameLimit()) {
-                       ObjectReader reader = repo.newObjectReader();
-                       try {
-                               SimilarityRenameDetector d;
-
-                               d = new SimilarityRenameDetector(reader, deleted, added);
-                               d.setRenameScore(getRenameScore());
-                               d.compute(pm);
-                               deleted = d.getLeftOverSources();
-                               added = d.getLeftOverDestinations();
-                               entries.addAll(d.getMatches());
-                       } finally {
-                               reader.release();
-                       }
+                       SimilarityRenameDetector d;
+
+                       d = new SimilarityRenameDetector(reader, deleted, added);
+                       d.setRenameScore(getRenameScore());
+                       d.compute(pm);
+                       deleted = d.getLeftOverSources();
+                       added = d.getLeftOverDestinations();
+                       entries.addAll(d.getMatches());
                } else {
                        overRenameLimit = true;
                }