summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJeff Schumacher <jeffschu@google.com>2010-07-22 12:55:28 -0700
committerShawn O. Pearce <spearce@spearce.org>2010-07-27 18:13:32 -0700
commit396fe6da4593645e1b9bada4fe314f6169ab2d17 (patch)
tree7c465cc05f2fec78b2ab3cf29e8ea04831ba0b7c
parentf56a459966c8e5564cb23ccb3c272a0daa05a1aa (diff)
downloadjgit-396fe6da4593645e1b9bada4fe314f6169ab2d17.tar.gz
jgit-396fe6da4593645e1b9bada4fe314f6169ab2d17.zip
Break dissimilar file pairs during diff
File pairs that are very dissimilar during a diff were not being broken apart into their constituent ADD/DELETE pairs. The leads to sub-optimal rename detection. Take, for example, this situation: A file exists at src/a.txt containing "foo". A user renames src/a.txt to src/b.txt, then adds a new src/a.txt containing "bar". Even though the old a.txt and the new b.txt are identical, the rename detection algorithm would not detect it as a rename since it was already paired in a MODIFY. I added code to split all MODIFYs below a certain score into their constituent ADD/DELETE pairs. This allows situations like the one I described above to be more correctly handled. Change-Id: I22c04b70581f206bbc68c4cd1ee87a1f663b418e Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
-rw-r--r--org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java121
-rw-r--r--org.eclipse.jgit/resources/org/eclipse/jgit/JGitText.properties2
-rw-r--r--org.eclipse.jgit/src/org/eclipse/jgit/JGitText.java2
-rw-r--r--org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffEntry.java12
-rw-r--r--org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffFormatter.java8
-rw-r--r--org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java152
6 files changed, 276 insertions, 21 deletions
diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java
index bfc51b68f9..eb965cf601 100644
--- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java
+++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java
@@ -383,6 +383,116 @@ public class RenameDetectorTest extends RepositoryTestCase {
assertSame(b, entries.get(1));
}
+ public void testBreakModify_BreakAll() throws Exception {
+ ObjectId aId = blob("foo");
+ ObjectId bId = blob("bar");
+
+ DiffEntry m = DiffEntry.modify(PATH_A);
+ m.oldId = AbbreviatedObjectId.fromObjectId(aId);
+ m.newId = AbbreviatedObjectId.fromObjectId(bId);
+
+ DiffEntry a = DiffEntry.add(PATH_B, aId);
+
+ rd.add(a);
+ rd.add(m);
+
+ rd.setBreakScore(101);
+
+ List<DiffEntry> entries = rd.compute();
+ assertEquals(2, entries.size());
+ assertAdd(PATH_A, bId, FileMode.REGULAR_FILE, entries.get(0));
+ assertRename(DiffEntry.breakModify(m).get(0), a, 100, entries.get(1));
+ }
+
+ public void testBreakModify_BreakNone() throws Exception {
+ ObjectId aId = blob("foo");
+ ObjectId bId = blob("bar");
+
+ DiffEntry m = DiffEntry.modify(PATH_A);
+ m.oldId = AbbreviatedObjectId.fromObjectId(aId);
+ m.newId = AbbreviatedObjectId.fromObjectId(bId);
+
+ DiffEntry a = DiffEntry.add(PATH_B, aId);
+
+ rd.add(a);
+ rd.add(m);
+
+ rd.setBreakScore(-1);
+
+ List<DiffEntry> entries = rd.compute();
+ assertEquals(2, entries.size());
+ assertSame(m, entries.get(0));
+ assertSame(a, entries.get(1));
+ }
+
+ public void testBreakModify_BreakBelowScore() throws Exception {
+ ObjectId aId = blob("foo");
+ ObjectId bId = blob("bar");
+
+ DiffEntry m = DiffEntry.modify(PATH_A);
+ m.oldId = AbbreviatedObjectId.fromObjectId(aId);
+ m.newId = AbbreviatedObjectId.fromObjectId(bId);
+
+ DiffEntry a = DiffEntry.add(PATH_B, aId);
+
+ rd.add(a);
+ rd.add(m);
+
+ rd.setBreakScore(20); // Should break the modify
+
+ List<DiffEntry> entries = rd.compute();
+ assertEquals(2, entries.size());
+ assertAdd(PATH_A, bId, FileMode.REGULAR_FILE, entries.get(0));
+ assertRename(DiffEntry.breakModify(m).get(0), a, 100, entries.get(1));
+ }
+
+ public void testBreakModify_DontBreakAboveScore() throws Exception {
+ ObjectId aId = blob("blah\nblah\nfoo");
+ ObjectId bId = blob("blah\nblah\nbar");
+
+ DiffEntry m = DiffEntry.modify(PATH_A);
+ m.oldId = AbbreviatedObjectId.fromObjectId(aId);
+ m.newId = AbbreviatedObjectId.fromObjectId(bId);
+
+ DiffEntry a = DiffEntry.add(PATH_B, aId);
+
+ rd.add(a);
+ rd.add(m);
+
+ rd.setBreakScore(20); // Should not break the modify
+
+ List<DiffEntry> entries = rd.compute();
+ assertEquals(2, entries.size());
+ assertSame(m, entries.get(0));
+ assertSame(a, entries.get(1));
+ }
+
+ public void testBreakModify_RejoinIfUnpaired() throws Exception {
+ ObjectId aId = blob("foo");
+ ObjectId bId = blob("bar");
+
+ DiffEntry m = DiffEntry.modify(PATH_A);
+ m.oldId = AbbreviatedObjectId.fromObjectId(aId);
+ m.newId = AbbreviatedObjectId.fromObjectId(bId);
+
+ rd.add(m);
+
+ rd.setBreakScore(101); // Ensure m is broken apart
+
+ List<DiffEntry> entries = rd.compute();
+ assertEquals(1, entries.size());
+
+ DiffEntry modify = entries.get(0);
+ assertEquals(m.oldName, modify.oldName);
+ assertEquals(m.oldId, modify.oldId);
+ assertEquals(m.oldMode, modify.oldMode);
+ assertEquals(m.newName, modify.newName);
+ assertEquals(m.newId, modify.newId);
+ assertEquals(m.newMode, modify.newMode);
+ assertEquals(m.changeType, modify.changeType);
+ assertEquals(0, modify.score);
+ }
+
public void testSetRenameScore_IllegalArgs() throws Exception {
try {
rd.setRenameScore(-1);
@@ -462,4 +572,15 @@ public class RenameDetectorTest extends RepositoryTestCase {
assertEquals(score, copy.getScore());
}
+
+ private static void assertAdd(String newName, ObjectId newId,
+ FileMode newMode, DiffEntry add) {
+ assertEquals(DiffEntry.DEV_NULL, add.oldName);
+ assertEquals(DiffEntry.A_ZERO, add.oldId);
+ assertEquals(FileMode.MISSING, add.oldMode);
+ assertEquals(ChangeType.ADD, add.changeType);
+ assertEquals(newName, add.newName);
+ assertEquals(AbbreviatedObjectId.fromObjectId(newId), add.newId);
+ assertEquals(newMode, add.newMode);
+ }
}
diff --git a/org.eclipse.jgit/resources/org/eclipse/jgit/JGitText.properties b/org.eclipse.jgit/resources/org/eclipse/jgit/JGitText.properties
index 88c6c8e3a4..1b2b81fce3 100644
--- a/org.eclipse.jgit/resources/org/eclipse/jgit/JGitText.properties
+++ b/org.eclipse.jgit/resources/org/eclipse/jgit/JGitText.properties
@@ -302,8 +302,10 @@ remoteDoesNotSupportSmartHTTPPush=remote does not support smart HTTP push
remoteHungUpUnexpectedly=remote hung up unexpectedly
remoteNameCantBeNull=Remote name can't be null.
renamesAlreadyFound=Renames have already been found.
+renamesBreakingModifies=Breaking apart modified file pairs
renamesFindingByContent=Finding renames by content similarity
renamesFindingExact=Finding exact renames
+renamesRejoiningModifies=Rejoining modified file pairs
repositoryAlreadyExists=Repository already exists: {0}
repositoryConfigFileInvalid=Repository config file {0} invalid {1}
repositoryIsRequired=Repository is required.
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/JGitText.java b/org.eclipse.jgit/src/org/eclipse/jgit/JGitText.java
index 9df9887af4..9d1e2cd808 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/JGitText.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/JGitText.java
@@ -361,8 +361,10 @@ public class JGitText extends TranslationBundle {
/***/ public String remoteHungUpUnexpectedly;
/***/ public String remoteNameCantBeNull;
/***/ public String renamesAlreadyFound;
+ /***/ public String renamesBreakingModifies;
/***/ public String renamesFindingByContent;
/***/ public String renamesFindingExact;
+ /***/ public String renamesRejoiningModifies;
/***/ public String repositoryAlreadyExists;
/***/ public String repositoryConfigFileInvalid;
/***/ public String repositoryIsRequired;
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffEntry.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffEntry.java
index 304e4cff39..7dbcdaa6d9 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffEntry.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffEntry.java
@@ -57,7 +57,8 @@ import org.eclipse.jgit.treewalk.TreeWalk;
/** A value class representing a change to a file */
public class DiffEntry {
- private static final AbbreviatedObjectId A_ZERO = AbbreviatedObjectId
+ /** Magical SHA1 used for file adds or deletes */
+ static final AbbreviatedObjectId A_ZERO = AbbreviatedObjectId
.fromObjectId(ObjectId.zeroId());
/** Magical file name used for file adds or deletes. */
@@ -171,6 +172,15 @@ public class DiffEntry {
return e;
}
+ /**
+ * Breaks apart a DiffEntry into two entries, one DELETE and one ADD.
+ *
+ * @param entry
+ * the DiffEntry to break apart.
+ * @return a list containing two entries. Calling {@link #getChangeType()}
+ * on the first entry will return ChangeType.DELETE. Calling it on
+ * the second entry will return ChangeType.ADD.
+ */
static List<DiffEntry> breakModify(DiffEntry entry) {
DiffEntry del = new DiffEntry();
del.oldId = entry.getOldId();
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffFormatter.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffFormatter.java
index 835cf6f865..4ee742ae16 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffFormatter.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffFormatter.java
@@ -290,6 +290,14 @@ public class DiffFormatter {
o.write('\n');
}
break;
+ case MODIFY:
+ int score = ent.getScore();
+ if (0 < score && score <= 100) {
+ o.write(encodeASCII("dissimilarity index " + (100 - score)
+ + "%"));
+ o.write('\n');
+ }
+ break;
}
switch (ent.getChangeType()) {
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java
index a3203e349f..fedd7cda15 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java
@@ -55,6 +55,7 @@ import java.util.List;
import org.eclipse.jgit.JGitText;
import org.eclipse.jgit.diff.DiffEntry.ChangeType;
import org.eclipse.jgit.lib.AbbreviatedObjectId;
+import org.eclipse.jgit.lib.Constants;
import org.eclipse.jgit.lib.FileMode;
import org.eclipse.jgit.lib.NullProgressMonitor;
import org.eclipse.jgit.lib.ObjectReader;
@@ -99,7 +100,7 @@ public class RenameDetector {
}
};
- private final List<DiffEntry> entries = new ArrayList<DiffEntry>();
+ private List<DiffEntry> entries = new ArrayList<DiffEntry>();
private List<DiffEntry> deleted = new ArrayList<DiffEntry>();
@@ -112,6 +113,13 @@ public class RenameDetector {
/** Similarity score required to pair an add/delete as a rename. */
private int renameScore = 60;
+ /**
+ * Similarity score required to keep modified file pairs together. Any
+ * modified file pairs with a similarity score below this will be broken
+ * apart.
+ */
+ private int breakScore = -1;
+
/** Limit in the number of files to consider for renames. */
private int renameLimit;
@@ -159,6 +167,29 @@ public class RenameDetector {
renameScore = score;
}
+ /**
+ * @return the similarity score required to keep modified file pairs
+ * together. Any modify pairs that score below this will be broken
+ * apart into separate add/deletes. Values less than or equal to
+ * zero indicate that no modifies will be broken apart. Values over
+ * 100 cause all modify pairs to be broken.
+ */
+ public int getBreakScore() {
+ return breakScore;
+ }
+
+ /**
+ * @param breakScore
+ * the similarity score required to keep modified file pairs
+ * together. Any modify pairs that score below this will be
+ * broken apart into separate add/deletes. Values less than or
+ * equal to zero indicate that no modifies will be broken apart.
+ * Values over 100 cause all modify pairs to be broken.
+ */
+ public void setBreakScore(int breakScore) {
+ this.breakScore = breakScore;
+ }
+
/** @return limit on number of paths to perform inexact rename detection. */
public int getRenameLimit() {
return renameLimit;
@@ -219,10 +250,13 @@ public class RenameDetector {
break;
case MODIFY:
- if (sameType(entry.getOldMode(), entry.getNewMode()))
+ if (sameType(entry.getOldMode(), entry.getNewMode())) {
entries.add(entry);
- else
- entries.addAll(DiffEntry.breakModify(entry));
+ } else {
+ List<DiffEntry> tmp = DiffEntry.breakModify(entry);
+ deleted.add(tmp.get(0));
+ added.add(tmp.get(1));
+ }
break;
case COPY:
@@ -275,8 +309,15 @@ public class RenameDetector {
if (pm == null)
pm = NullProgressMonitor.INSTANCE;
- findExactRenames(pm);
- findContentRenames(pm);
+ ObjectReader reader = repo.newObjectReader();
+ try {
+ breakModifies(reader, pm);
+ findExactRenames(pm);
+ findContentRenames(reader, pm);
+ rejoinModifies(pm);
+ } finally {
+ reader.release();
+ }
entries.addAll(added);
added = null;
@@ -289,25 +330,96 @@ public class RenameDetector {
return Collections.unmodifiableList(entries);
}
- private void findContentRenames(ProgressMonitor pm) throws IOException {
+ private void breakModifies(ObjectReader reader, ProgressMonitor pm)
+ throws IOException {
+ if (breakScore <= 0)
+ return;
+
+ ArrayList<DiffEntry> newEntries = new ArrayList<DiffEntry>(entries.size());
+
+ pm.beginTask(JGitText.get().renamesBreakingModifies, entries.size());
+
+ for (int i = 0; i < entries.size(); i++) {
+ DiffEntry e = entries.get(i);
+ if (e.getChangeType() == ChangeType.MODIFY) {
+ int score = calculateModifyScore(reader, e);
+ if (score < breakScore) {
+ List<DiffEntry> tmp = DiffEntry.breakModify(e);
+ DiffEntry del = tmp.get(0);
+ del.score = score;
+ deleted.add(del);
+ added.add(tmp.get(1));
+ } else {
+ newEntries.add(e);
+ }
+ } else {
+ newEntries.add(e);
+ }
+ pm.update(1);
+ }
+
+ entries = newEntries;
+ }
+
+ private void rejoinModifies(ProgressMonitor pm) {
+ HashMap<String, DiffEntry> nameMap = new HashMap<String, DiffEntry>();
+ ArrayList<DiffEntry> newAdded = new ArrayList<DiffEntry>(added.size());
+
+ pm.beginTask(JGitText.get().renamesRejoiningModifies, added.size()
+ + deleted.size());
+
+ for (DiffEntry src : deleted) {
+ nameMap.put(src.oldName, src);
+ pm.update(1);
+ }
+
+ for (DiffEntry dst : added) {
+ DiffEntry src = nameMap.remove(dst.newName);
+ if (src != null) {
+ if (sameType(src.oldMode, dst.newMode)) {
+ entries.add(DiffEntry.pair(ChangeType.MODIFY, src, dst,
+ src.score));
+ } else {
+ nameMap.put(src.oldName, src);
+ newAdded.add(dst);
+ }
+ } else {
+ newAdded.add(dst);
+ }
+ pm.update(1);
+ }
+
+ added = newAdded;
+ deleted = new ArrayList<DiffEntry>(nameMap.values());
+ }
+
+ private int calculateModifyScore(ObjectReader reader, DiffEntry d)
+ throws IOException {
+ SimilarityIndex src = new SimilarityIndex();
+ src.hash(reader.open(d.oldId.toObjectId(), Constants.OBJ_BLOB));
+ src.sort();
+
+ SimilarityIndex dst = new SimilarityIndex();
+ dst.hash(reader.open(d.newId.toObjectId(), Constants.OBJ_BLOB));
+ dst.sort();
+ return src.score(dst, 100);
+ }
+
+ private void findContentRenames(ObjectReader reader, ProgressMonitor pm)
+ throws IOException {
int cnt = Math.max(added.size(), deleted.size());
if (cnt == 0)
return;
if (getRenameLimit() == 0 || cnt <= getRenameLimit()) {
- ObjectReader reader = repo.newObjectReader();
- try {
- SimilarityRenameDetector d;
-
- d = new SimilarityRenameDetector(reader, deleted, added);
- d.setRenameScore(getRenameScore());
- d.compute(pm);
- deleted = d.getLeftOverSources();
- added = d.getLeftOverDestinations();
- entries.addAll(d.getMatches());
- } finally {
- reader.release();
- }
+ SimilarityRenameDetector d;
+
+ d = new SimilarityRenameDetector(reader, deleted, added);
+ d.setRenameScore(getRenameScore());
+ d.compute(pm);
+ deleted = d.getLeftOverSources();
+ added = d.getLeftOverDestinations();
+ entries.addAll(d.getMatches());
} else {
overRenameLimit = true;
}