diff options
author | Jeff Schumacher <jeffschu@google.com> | 2010-07-22 12:55:28 -0700 |
---|---|---|
committer | Shawn O. Pearce <spearce@spearce.org> | 2010-07-27 18:13:32 -0700 |
commit | 396fe6da4593645e1b9bada4fe314f6169ab2d17 (patch) | |
tree | 7c465cc05f2fec78b2ab3cf29e8ea04831ba0b7c | |
parent | f56a459966c8e5564cb23ccb3c272a0daa05a1aa (diff) | |
download | jgit-396fe6da4593645e1b9bada4fe314f6169ab2d17.tar.gz jgit-396fe6da4593645e1b9bada4fe314f6169ab2d17.zip |
Break dissimilar file pairs during diff
File pairs that are very dissimilar during a diff were not being
broken apart into their constituent ADD/DELETE pairs. The leads to
sub-optimal rename detection. Take, for example, this situation:
A file exists at src/a.txt containing "foo". A user renames src/a.txt
to src/b.txt, then adds a new src/a.txt containing "bar".
Even though the old a.txt and the new b.txt are identical, the
rename detection algorithm would not detect it as a rename since
it was already paired in a MODIFY. I added code to split all
MODIFYs below a certain score into their constituent ADD/DELETE
pairs. This allows situations like the one I described above to be
more correctly handled.
Change-Id: I22c04b70581f206bbc68c4cd1ee87a1f663b418e
Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
6 files changed, 276 insertions, 21 deletions
diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java index bfc51b68f9..eb965cf601 100644 --- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java +++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java @@ -383,6 +383,116 @@ public class RenameDetectorTest extends RepositoryTestCase { assertSame(b, entries.get(1)); } + public void testBreakModify_BreakAll() throws Exception { + ObjectId aId = blob("foo"); + ObjectId bId = blob("bar"); + + DiffEntry m = DiffEntry.modify(PATH_A); + m.oldId = AbbreviatedObjectId.fromObjectId(aId); + m.newId = AbbreviatedObjectId.fromObjectId(bId); + + DiffEntry a = DiffEntry.add(PATH_B, aId); + + rd.add(a); + rd.add(m); + + rd.setBreakScore(101); + + List<DiffEntry> entries = rd.compute(); + assertEquals(2, entries.size()); + assertAdd(PATH_A, bId, FileMode.REGULAR_FILE, entries.get(0)); + assertRename(DiffEntry.breakModify(m).get(0), a, 100, entries.get(1)); + } + + public void testBreakModify_BreakNone() throws Exception { + ObjectId aId = blob("foo"); + ObjectId bId = blob("bar"); + + DiffEntry m = DiffEntry.modify(PATH_A); + m.oldId = AbbreviatedObjectId.fromObjectId(aId); + m.newId = AbbreviatedObjectId.fromObjectId(bId); + + DiffEntry a = DiffEntry.add(PATH_B, aId); + + rd.add(a); + rd.add(m); + + rd.setBreakScore(-1); + + List<DiffEntry> entries = rd.compute(); + assertEquals(2, entries.size()); + assertSame(m, entries.get(0)); + assertSame(a, entries.get(1)); + } + + public void testBreakModify_BreakBelowScore() throws Exception { + ObjectId aId = blob("foo"); + ObjectId bId = blob("bar"); + + DiffEntry m = DiffEntry.modify(PATH_A); + m.oldId = AbbreviatedObjectId.fromObjectId(aId); + m.newId = AbbreviatedObjectId.fromObjectId(bId); + + DiffEntry a = DiffEntry.add(PATH_B, aId); + + rd.add(a); + rd.add(m); + + rd.setBreakScore(20); // Should break the modify + + List<DiffEntry> entries = rd.compute(); + assertEquals(2, entries.size()); + assertAdd(PATH_A, bId, FileMode.REGULAR_FILE, entries.get(0)); + assertRename(DiffEntry.breakModify(m).get(0), a, 100, entries.get(1)); + } + + public void testBreakModify_DontBreakAboveScore() throws Exception { + ObjectId aId = blob("blah\nblah\nfoo"); + ObjectId bId = blob("blah\nblah\nbar"); + + DiffEntry m = DiffEntry.modify(PATH_A); + m.oldId = AbbreviatedObjectId.fromObjectId(aId); + m.newId = AbbreviatedObjectId.fromObjectId(bId); + + DiffEntry a = DiffEntry.add(PATH_B, aId); + + rd.add(a); + rd.add(m); + + rd.setBreakScore(20); // Should not break the modify + + List<DiffEntry> entries = rd.compute(); + assertEquals(2, entries.size()); + assertSame(m, entries.get(0)); + assertSame(a, entries.get(1)); + } + + public void testBreakModify_RejoinIfUnpaired() throws Exception { + ObjectId aId = blob("foo"); + ObjectId bId = blob("bar"); + + DiffEntry m = DiffEntry.modify(PATH_A); + m.oldId = AbbreviatedObjectId.fromObjectId(aId); + m.newId = AbbreviatedObjectId.fromObjectId(bId); + + rd.add(m); + + rd.setBreakScore(101); // Ensure m is broken apart + + List<DiffEntry> entries = rd.compute(); + assertEquals(1, entries.size()); + + DiffEntry modify = entries.get(0); + assertEquals(m.oldName, modify.oldName); + assertEquals(m.oldId, modify.oldId); + assertEquals(m.oldMode, modify.oldMode); + assertEquals(m.newName, modify.newName); + assertEquals(m.newId, modify.newId); + assertEquals(m.newMode, modify.newMode); + assertEquals(m.changeType, modify.changeType); + assertEquals(0, modify.score); + } + public void testSetRenameScore_IllegalArgs() throws Exception { try { rd.setRenameScore(-1); @@ -462,4 +572,15 @@ public class RenameDetectorTest extends RepositoryTestCase { assertEquals(score, copy.getScore()); } + + private static void assertAdd(String newName, ObjectId newId, + FileMode newMode, DiffEntry add) { + assertEquals(DiffEntry.DEV_NULL, add.oldName); + assertEquals(DiffEntry.A_ZERO, add.oldId); + assertEquals(FileMode.MISSING, add.oldMode); + assertEquals(ChangeType.ADD, add.changeType); + assertEquals(newName, add.newName); + assertEquals(AbbreviatedObjectId.fromObjectId(newId), add.newId); + assertEquals(newMode, add.newMode); + } } diff --git a/org.eclipse.jgit/resources/org/eclipse/jgit/JGitText.properties b/org.eclipse.jgit/resources/org/eclipse/jgit/JGitText.properties index 88c6c8e3a4..1b2b81fce3 100644 --- a/org.eclipse.jgit/resources/org/eclipse/jgit/JGitText.properties +++ b/org.eclipse.jgit/resources/org/eclipse/jgit/JGitText.properties @@ -302,8 +302,10 @@ remoteDoesNotSupportSmartHTTPPush=remote does not support smart HTTP push remoteHungUpUnexpectedly=remote hung up unexpectedly remoteNameCantBeNull=Remote name can't be null. renamesAlreadyFound=Renames have already been found. +renamesBreakingModifies=Breaking apart modified file pairs renamesFindingByContent=Finding renames by content similarity renamesFindingExact=Finding exact renames +renamesRejoiningModifies=Rejoining modified file pairs repositoryAlreadyExists=Repository already exists: {0} repositoryConfigFileInvalid=Repository config file {0} invalid {1} repositoryIsRequired=Repository is required. diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/JGitText.java b/org.eclipse.jgit/src/org/eclipse/jgit/JGitText.java index 9df9887af4..9d1e2cd808 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/JGitText.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/JGitText.java @@ -361,8 +361,10 @@ public class JGitText extends TranslationBundle { /***/ public String remoteHungUpUnexpectedly; /***/ public String remoteNameCantBeNull; /***/ public String renamesAlreadyFound; + /***/ public String renamesBreakingModifies; /***/ public String renamesFindingByContent; /***/ public String renamesFindingExact; + /***/ public String renamesRejoiningModifies; /***/ public String repositoryAlreadyExists; /***/ public String repositoryConfigFileInvalid; /***/ public String repositoryIsRequired; diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffEntry.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffEntry.java index 304e4cff39..7dbcdaa6d9 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffEntry.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffEntry.java @@ -57,7 +57,8 @@ import org.eclipse.jgit.treewalk.TreeWalk; /** A value class representing a change to a file */ public class DiffEntry { - private static final AbbreviatedObjectId A_ZERO = AbbreviatedObjectId + /** Magical SHA1 used for file adds or deletes */ + static final AbbreviatedObjectId A_ZERO = AbbreviatedObjectId .fromObjectId(ObjectId.zeroId()); /** Magical file name used for file adds or deletes. */ @@ -171,6 +172,15 @@ public class DiffEntry { return e; } + /** + * Breaks apart a DiffEntry into two entries, one DELETE and one ADD. + * + * @param entry + * the DiffEntry to break apart. + * @return a list containing two entries. Calling {@link #getChangeType()} + * on the first entry will return ChangeType.DELETE. Calling it on + * the second entry will return ChangeType.ADD. + */ static List<DiffEntry> breakModify(DiffEntry entry) { DiffEntry del = new DiffEntry(); del.oldId = entry.getOldId(); diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffFormatter.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffFormatter.java index 835cf6f865..4ee742ae16 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffFormatter.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/DiffFormatter.java @@ -290,6 +290,14 @@ public class DiffFormatter { o.write('\n'); } break; + case MODIFY: + int score = ent.getScore(); + if (0 < score && score <= 100) { + o.write(encodeASCII("dissimilarity index " + (100 - score) + + "%")); + o.write('\n'); + } + break; } switch (ent.getChangeType()) { diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java index a3203e349f..fedd7cda15 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java @@ -55,6 +55,7 @@ import java.util.List; import org.eclipse.jgit.JGitText; import org.eclipse.jgit.diff.DiffEntry.ChangeType; import org.eclipse.jgit.lib.AbbreviatedObjectId; +import org.eclipse.jgit.lib.Constants; import org.eclipse.jgit.lib.FileMode; import org.eclipse.jgit.lib.NullProgressMonitor; import org.eclipse.jgit.lib.ObjectReader; @@ -99,7 +100,7 @@ public class RenameDetector { } }; - private final List<DiffEntry> entries = new ArrayList<DiffEntry>(); + private List<DiffEntry> entries = new ArrayList<DiffEntry>(); private List<DiffEntry> deleted = new ArrayList<DiffEntry>(); @@ -112,6 +113,13 @@ public class RenameDetector { /** Similarity score required to pair an add/delete as a rename. */ private int renameScore = 60; + /** + * Similarity score required to keep modified file pairs together. Any + * modified file pairs with a similarity score below this will be broken + * apart. + */ + private int breakScore = -1; + /** Limit in the number of files to consider for renames. */ private int renameLimit; @@ -159,6 +167,29 @@ public class RenameDetector { renameScore = score; } + /** + * @return the similarity score required to keep modified file pairs + * together. Any modify pairs that score below this will be broken + * apart into separate add/deletes. Values less than or equal to + * zero indicate that no modifies will be broken apart. Values over + * 100 cause all modify pairs to be broken. + */ + public int getBreakScore() { + return breakScore; + } + + /** + * @param breakScore + * the similarity score required to keep modified file pairs + * together. Any modify pairs that score below this will be + * broken apart into separate add/deletes. Values less than or + * equal to zero indicate that no modifies will be broken apart. + * Values over 100 cause all modify pairs to be broken. + */ + public void setBreakScore(int breakScore) { + this.breakScore = breakScore; + } + /** @return limit on number of paths to perform inexact rename detection. */ public int getRenameLimit() { return renameLimit; @@ -219,10 +250,13 @@ public class RenameDetector { break; case MODIFY: - if (sameType(entry.getOldMode(), entry.getNewMode())) + if (sameType(entry.getOldMode(), entry.getNewMode())) { entries.add(entry); - else - entries.addAll(DiffEntry.breakModify(entry)); + } else { + List<DiffEntry> tmp = DiffEntry.breakModify(entry); + deleted.add(tmp.get(0)); + added.add(tmp.get(1)); + } break; case COPY: @@ -275,8 +309,15 @@ public class RenameDetector { if (pm == null) pm = NullProgressMonitor.INSTANCE; - findExactRenames(pm); - findContentRenames(pm); + ObjectReader reader = repo.newObjectReader(); + try { + breakModifies(reader, pm); + findExactRenames(pm); + findContentRenames(reader, pm); + rejoinModifies(pm); + } finally { + reader.release(); + } entries.addAll(added); added = null; @@ -289,25 +330,96 @@ public class RenameDetector { return Collections.unmodifiableList(entries); } - private void findContentRenames(ProgressMonitor pm) throws IOException { + private void breakModifies(ObjectReader reader, ProgressMonitor pm) + throws IOException { + if (breakScore <= 0) + return; + + ArrayList<DiffEntry> newEntries = new ArrayList<DiffEntry>(entries.size()); + + pm.beginTask(JGitText.get().renamesBreakingModifies, entries.size()); + + for (int i = 0; i < entries.size(); i++) { + DiffEntry e = entries.get(i); + if (e.getChangeType() == ChangeType.MODIFY) { + int score = calculateModifyScore(reader, e); + if (score < breakScore) { + List<DiffEntry> tmp = DiffEntry.breakModify(e); + DiffEntry del = tmp.get(0); + del.score = score; + deleted.add(del); + added.add(tmp.get(1)); + } else { + newEntries.add(e); + } + } else { + newEntries.add(e); + } + pm.update(1); + } + + entries = newEntries; + } + + private void rejoinModifies(ProgressMonitor pm) { + HashMap<String, DiffEntry> nameMap = new HashMap<String, DiffEntry>(); + ArrayList<DiffEntry> newAdded = new ArrayList<DiffEntry>(added.size()); + + pm.beginTask(JGitText.get().renamesRejoiningModifies, added.size() + + deleted.size()); + + for (DiffEntry src : deleted) { + nameMap.put(src.oldName, src); + pm.update(1); + } + + for (DiffEntry dst : added) { + DiffEntry src = nameMap.remove(dst.newName); + if (src != null) { + if (sameType(src.oldMode, dst.newMode)) { + entries.add(DiffEntry.pair(ChangeType.MODIFY, src, dst, + src.score)); + } else { + nameMap.put(src.oldName, src); + newAdded.add(dst); + } + } else { + newAdded.add(dst); + } + pm.update(1); + } + + added = newAdded; + deleted = new ArrayList<DiffEntry>(nameMap.values()); + } + + private int calculateModifyScore(ObjectReader reader, DiffEntry d) + throws IOException { + SimilarityIndex src = new SimilarityIndex(); + src.hash(reader.open(d.oldId.toObjectId(), Constants.OBJ_BLOB)); + src.sort(); + + SimilarityIndex dst = new SimilarityIndex(); + dst.hash(reader.open(d.newId.toObjectId(), Constants.OBJ_BLOB)); + dst.sort(); + return src.score(dst, 100); + } + + private void findContentRenames(ObjectReader reader, ProgressMonitor pm) + throws IOException { int cnt = Math.max(added.size(), deleted.size()); if (cnt == 0) return; if (getRenameLimit() == 0 || cnt <= getRenameLimit()) { - ObjectReader reader = repo.newObjectReader(); - try { - SimilarityRenameDetector d; - - d = new SimilarityRenameDetector(reader, deleted, added); - d.setRenameScore(getRenameScore()); - d.compute(pm); - deleted = d.getLeftOverSources(); - added = d.getLeftOverDestinations(); - entries.addAll(d.getMatches()); - } finally { - reader.release(); - } + SimilarityRenameDetector d; + + d = new SimilarityRenameDetector(reader, deleted, added); + d.setRenameScore(getRenameScore()); + d.compute(pm); + deleted = d.getLeftOverSources(); + added = d.getLeftOverDestinations(); + entries.addAll(d.getMatches()); } else { overRenameLimit = true; } |