source.dussan.org Git - jgit.git/commitdiff
Added file path similarity to scoring metric in rename detection 93/1093/3
author: Jeff Schumacher <jeffschu@google.com>
Fri, 9 Jul 2010 22:11:54 +0000 (15:11 -0700)
committer: Jeff Schumacher <jeffschu@google.com>
Mon, 12 Jul 2010 19:52:05 +0000 (12:52 -0700)
The scoring method was not taking into account the similarity of
the file paths and file names. I changed the metric so that it is 99%
based on content (which used to be 100% of the old metric), and 1%
based on path similarity. Of that 1%, half (.5% of the total final
score) is based on the actual file names (e.g. "foo.java"), and half
on the directory (e.g. "src/com/foo/bar/").

Change-Id: I94f0c23bf6413c491b10d5625f6ad7d2ecfb4def

org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java
org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java
org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java

index c4cb600db0b377ba520499b0d8d29f7949050303..fe0c565d6de7a069b8ce2a3021cacae9a91df726 100644 (file)
@@ -124,8 +124,8 @@ public class RenameDetectorTest extends RepositoryTestCase {
        }
 
        public void testInexactRename_OnePair() throws Exception {
-               ObjectId aId = blob("foo\nbar\nbaz\n");
-               ObjectId bId = blob("foo\nbar\nblah\n");
+               ObjectId aId = blob("foo\nbar\nbaz\nblarg\n");
+               ObjectId bId = blob("foo\nbar\nbaz\nblah\n");
 
                DiffEntry a = DiffEntry.add(PATH_A, aId);
                DiffEntry b = DiffEntry.delete(PATH_Q, bId);
@@ -135,12 +135,12 @@ public class RenameDetectorTest extends RepositoryTestCase {
 
                List<DiffEntry> entries = rd.compute();
                assertEquals(1, entries.size());
-               assertRename(b, a, 61, entries.get(0));
+               assertRename(b, a, 66, entries.get(0));
        }
 
        public void testInexactRename_OneRenameTwoUnrelatedFiles() throws Exception {
-               ObjectId aId = blob("foo\nbar\nbaz\n");
-               ObjectId bId = blob("foo\nbar\nblah\n");
+               ObjectId aId = blob("foo\nbar\nbaz\nblarg\n");
+               ObjectId bId = blob("foo\nbar\nbaz\nblah\n");
                DiffEntry a = DiffEntry.add(PATH_A, aId);
                DiffEntry b = DiffEntry.delete(PATH_Q, bId);
 
@@ -158,7 +158,7 @@ public class RenameDetectorTest extends RepositoryTestCase {
                assertEquals(3, entries.size());
                assertSame(c, entries.get(0));
                assertSame(d, entries.get(1));
-               assertRename(b, a, 61, entries.get(2));
+               assertRename(b, a, 66, entries.get(2));
        }
 
        public void testInexactRename_LastByteDifferent() throws Exception {
index 9ab745fac9947b8fab10720f4fa19b28b6f9ff5c..d6915eb872adbe458dd3300e11d4579a6a9779cb 100644 (file)
@@ -78,8 +78,8 @@ public class SimilarityIndexTest extends TestCase {
                assertEquals(8, src.common(dst));
                assertEquals(8, dst.common(src));
 
-               assertEquals(100, src.score(dst));
-               assertEquals(100, dst.score(src));
+               assertEquals(100, src.score(dst, 100));
+               assertEquals(100, dst.score(src, 100));
        }
 
        public void testCommonScore_EmptyFiles() {
@@ -102,8 +102,8 @@ public class SimilarityIndexTest extends TestCase {
                assertEquals(6, src.common(dst));
                assertEquals(6, dst.common(src));
 
-               assertEquals(75, src.score(dst));
-               assertEquals(75, dst.score(src));
+               assertEquals(75, src.score(dst, 100));
+               assertEquals(75, dst.score(src, 100));
        }
 
        private static SimilarityIndex hash(String text) {
index f4cccfc37d75181ccfa0adb089b0b2c09e016ac4..d5a31d6044a30589cb8660bea7f6c1c6f6a60572 100644 (file)
@@ -142,11 +142,28 @@ class SimilarityIndex {
                Arrays.sort(idHash);
        }
 
-       int score(SimilarityIndex dst) {
+       /**
+        * Score the similarity between this index and another one.
+        * <p>
+        * The score is {@code common(dst)} expressed as a fraction of the larger
+        * of the two {@code fileSize} values and scaled by {@code maxScore}.
+        *
+        * @param dst
+        *            the other index to compare against.
+        * @param maxScore
+        *            the value returned when the two indexes match completely;
+        *            it sets the resolution of the result (callers in this
+        *            change use 100 and 10000).
+        * @return the scaled score, or {@code maxScore} if both files are empty.
+        */
+       int score(SimilarityIndex dst, int maxScore) {
                long max = Math.max(fileSize, dst.fileSize);
                if (max == 0)
-                       return 100;
-               return (int) ((common(dst) * 100L) / max);
+                       return maxScore;
+               // Multiply as long: common(dst) is an int, and with maxScore at
+               // 10000 an int product overflows once common(dst) exceeds 214748.
+               // The previous code avoided this by multiplying with 100L.
+               return (int) ((common(dst) * (long) maxScore) / max);
        }
 
        int common(SimilarityIndex dst) {
index a343fc0625bcd03ceac27df0fe5d1970637030fb..6590f746f3789709f0164bc50adebb17cada1227 100644 (file)
@@ -260,7 +260,14 @@ class SimilarityRenameDetector {
                                }
 
                                SimilarityIndex d = hash(dstEnt.newId.toObjectId());
-                               int score = s.score(d);
+                               int contentScore = s.score(d, 10000);
+
+                               // nameScore returns a value between 0 and 100, but we want it
+                               // to be in the same range as the content score. This allows it
+                               // to be dropped into the pretty formula for the final score.
+                               int nameScore = nameScore(srcEnt.oldName, dstEnt.newName) * 100;
+
+                               int score = (contentScore * 99 + nameScore * 1) / 10000;
 
                                if (score < renameScore) {
                                        pm.update(1);
@@ -280,6 +287,81 @@ class SimilarityRenameDetector {
                return mNext;
        }
 
+       /**
+        * Score the similarity of two paths on a scale of 0 to 100.
+        * <p>
+        * Half of the score comes from the directory part of each path
+        * (everything up to and including the last '/'), averaged over the
+        * length of the common prefix and the length of the common suffix. The
+        * other half comes from the length of the common suffix of the file
+        * names. Each length is taken as a percentage of the longer of the two
+        * strings being compared.
+        * <p>
+        * The method reads no instance state, so it is static and package
+        * visible to allow direct unit testing.
+        *
+        * @param a
+        *            first path, using '/' as the separator.
+        * @param b
+        *            second path, using '/' as the separator.
+        * @return a value from 0 (nothing alike) to 100 (identical paths).
+        */
+       static int nameScore(String a, String b) {
+               int aDirLen = a.lastIndexOf('/') + 1;
+               int bDirLen = b.lastIndexOf('/') + 1;
+
+               int dirMin = Math.min(aDirLen, bDirLen);
+               int dirMax = Math.max(aDirLen, bDirLen);
+
+               final int dirScoreLtr;
+               final int dirScoreRtl;
+
+               if (dirMax == 0) {
+                       // Neither path has a directory part, so the directories match.
+                       dirScoreLtr = 100;
+                       dirScoreRtl = 100;
+               } else {
+                       // Left to right: length of the common directory prefix.
+                       int dirSim = 0;
+                       for (; dirSim < dirMin; dirSim++) {
+                               if (a.charAt(dirSim) != b.charAt(dirSim))
+                                       break;
+                       }
+                       dirScoreLtr = (dirSim * 100) / dirMax;
+
+                       if (dirScoreLtr == 100) {
+                               // Identical directories; the suffix scan would agree.
+                               dirScoreRtl = 100;
+                       } else {
+                               // Right to left: length of the common directory suffix.
+                               for (dirSim = 0; dirSim < dirMin; dirSim++) {
+                                       if (a.charAt(aDirLen - 1 - dirSim) != b.charAt(bDirLen - 1
+                                                       - dirSim))
+                                               break;
+                               }
+                               dirScoreRtl = (dirSim * 100) / dirMax;
+                       }
+               }
+
+               int fileMin = Math.min(a.length() - aDirLen, b.length() - bDirLen);
+               int fileMax = Math.max(a.length() - aDirLen, b.length() - bDirLen);
+
+               final int fileScore;
+               if (fileMax == 0) {
+                       // Both file name parts are empty (the paths are empty or end
+                       // in '/'). They are trivially equal, and handling them here
+                       // avoids dividing by zero below.
+                       fileScore = 100;
+               } else {
+                       // Compare from the end so that a shared extension counts.
+                       int fileSim = 0;
+                       for (; fileSim < fileMin; fileSim++) {
+                               if (a.charAt(a.length() - 1 - fileSim) != b.charAt(b.length() - 1
+                                               - fileSim))
+                                       break;
+                       }
+                       fileScore = (fileSim * 100) / fileMax;
+               }
+
+               return (((dirScoreLtr + dirScoreRtl) * 25) + (fileScore * 50)) / 100;
+       }
+
        private SimilarityIndex hash(ObjectId objectId) throws IOException {
                SimilarityIndex r = new SimilarityIndex();
                r.hash(repo.openObject(objectId));