The scoring method did not take into account the similarity of the file paths and file names. I changed the metric so that it is 99% based on content (which was 100% of the old metric) and 1% based on path similarity. Of that 1%, half (0.5% of the total final score) is based on the actual file names (e.g. "foo.java"), and half on the directory (e.g. "src/com/foo/bar/"). Change-Id: I94f0c23bf6413c491b10d5625f6ad7d2ecfb4def tags/v0.9.1
@@ -124,8 +124,8 @@ public class RenameDetectorTest extends RepositoryTestCase { | |||
} | |||
public void testInexactRename_OnePair() throws Exception { | |||
ObjectId aId = blob("foo\nbar\nbaz\n"); | |||
ObjectId bId = blob("foo\nbar\nblah\n"); | |||
ObjectId aId = blob("foo\nbar\nbaz\nblarg\n"); | |||
ObjectId bId = blob("foo\nbar\nbaz\nblah\n"); | |||
DiffEntry a = DiffEntry.add(PATH_A, aId); | |||
DiffEntry b = DiffEntry.delete(PATH_Q, bId); | |||
@@ -135,12 +135,12 @@ public class RenameDetectorTest extends RepositoryTestCase { | |||
List<DiffEntry> entries = rd.compute(); | |||
assertEquals(1, entries.size()); | |||
assertRename(b, a, 61, entries.get(0)); | |||
assertRename(b, a, 66, entries.get(0)); | |||
} | |||
public void testInexactRename_OneRenameTwoUnrelatedFiles() throws Exception { | |||
ObjectId aId = blob("foo\nbar\nbaz\n"); | |||
ObjectId bId = blob("foo\nbar\nblah\n"); | |||
ObjectId aId = blob("foo\nbar\nbaz\nblarg\n"); | |||
ObjectId bId = blob("foo\nbar\nbaz\nblah\n"); | |||
DiffEntry a = DiffEntry.add(PATH_A, aId); | |||
DiffEntry b = DiffEntry.delete(PATH_Q, bId); | |||
@@ -158,7 +158,7 @@ public class RenameDetectorTest extends RepositoryTestCase { | |||
assertEquals(3, entries.size()); | |||
assertSame(c, entries.get(0)); | |||
assertSame(d, entries.get(1)); | |||
assertRename(b, a, 61, entries.get(2)); | |||
assertRename(b, a, 66, entries.get(2)); | |||
} | |||
public void testInexactRename_LastByteDifferent() throws Exception { |
@@ -78,8 +78,8 @@ public class SimilarityIndexTest extends TestCase { | |||
assertEquals(8, src.common(dst)); | |||
assertEquals(8, dst.common(src)); | |||
assertEquals(100, src.score(dst)); | |||
assertEquals(100, dst.score(src)); | |||
assertEquals(100, src.score(dst, 100)); | |||
assertEquals(100, dst.score(src, 100)); | |||
} | |||
public void testCommonScore_EmptyFiles() { | |||
@@ -102,8 +102,8 @@ public class SimilarityIndexTest extends TestCase { | |||
assertEquals(6, src.common(dst)); | |||
assertEquals(6, dst.common(src)); | |||
assertEquals(75, src.score(dst)); | |||
assertEquals(75, dst.score(src)); | |||
assertEquals(75, src.score(dst, 100)); | |||
assertEquals(75, dst.score(src, 100)); | |||
} | |||
private static SimilarityIndex hash(String text) { |
@@ -142,11 +142,11 @@ class SimilarityIndex { | |||
Arrays.sort(idHash); | |||
} | |||
int score(SimilarityIndex dst) { | |||
int score(SimilarityIndex dst, int maxScore) { | |||
long max = Math.max(fileSize, dst.fileSize); | |||
if (max == 0) | |||
return 100; | |||
return (int) ((common(dst) * 100L) / max); | |||
return maxScore; | |||
return (int) ((common(dst) * maxScore) / max); | |||
} | |||
int common(SimilarityIndex dst) { |
@@ -260,7 +260,14 @@ class SimilarityRenameDetector { | |||
} | |||
SimilarityIndex d = hash(dstEnt.newId.toObjectId()); | |||
int score = s.score(d); | |||
int contentScore = s.score(d, 10000); | |||
// nameScore returns a value between 0 and 100, but we want it | |||
// to be in the same range as the content score. This allows it | |||
// to be dropped into the pretty formula for the final score. | |||
int nameScore = nameScore(srcEnt.oldName, dstEnt.newName) * 100; | |||
int score = (contentScore * 99 + nameScore * 1) / 10000; | |||
if (score < renameScore) { | |||
pm.update(1); | |||
@@ -280,6 +287,53 @@ class SimilarityRenameDetector { | |||
return mNext; | |||
} | |||
private int nameScore(String a, String b) { | |||
int aDirLen = a.lastIndexOf("/") + 1; | |||
int bDirLen = b.lastIndexOf("/") + 1; | |||
int dirMin = Math.min(aDirLen, bDirLen); | |||
int dirMax = Math.max(aDirLen, bDirLen); | |||
final int dirScoreLtr; | |||
final int dirScoreRtl; | |||
if (dirMax == 0) { | |||
dirScoreLtr = 100; | |||
dirScoreRtl = 100; | |||
} else { | |||
int dirSim = 0; | |||
for (; dirSim < dirMin; dirSim++) { | |||
if (a.charAt(dirSim) != b.charAt(dirSim)) | |||
break; | |||
} | |||
dirScoreLtr = (dirSim * 100) / dirMax; | |||
if (dirScoreLtr == 100) { | |||
dirScoreRtl = 100; | |||
} else { | |||
for (dirSim = 0; dirSim < dirMin; dirSim++) { | |||
if (a.charAt(aDirLen - 1 - dirSim) != b.charAt(bDirLen - 1 | |||
- dirSim)) | |||
break; | |||
} | |||
dirScoreRtl = (dirSim * 100) / dirMax; | |||
} | |||
} | |||
int fileMin = Math.min(a.length() - aDirLen, b.length() - bDirLen); | |||
int fileMax = Math.max(a.length() - aDirLen, b.length() - bDirLen); | |||
int fileSim = 0; | |||
for (; fileSim < fileMin; fileSim++) { | |||
if (a.charAt(a.length() - 1 - fileSim) != b.charAt(b.length() - 1 | |||
- fileSim)) | |||
break; | |||
} | |||
int fileScore = (fileSim * 100) / fileMax; | |||
return (((dirScoreLtr + dirScoreRtl) * 25) + (fileScore * 50)) / 100; | |||
} | |||
private SimilarityIndex hash(ObjectId objectId) throws IOException { | |||
SimilarityIndex r = new SimilarityIndex(); | |||
r.hash(repo.openObject(objectId)); |