The similarity score did not take file paths or file names into account. The metric is now 99% content similarity (previously the entire score) and 1% path similarity. That 1% is split evenly: half (0.5% of the final score) comes from the file name (e.g. "foo.java") and half from the directory (e.g. "src/com/foo/bar/"). Change-Id: I94f0c23bf6413c491b10d5625f6ad7d2ecfb4def

14 years ago · 9a48de86d8
--- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java
+++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java
@@ -124,8 +124,8 @@ public class RenameDetectorTest extends RepositoryTestCase {
 	}

 	public void testInexactRename_OnePair() throws Exception {
 		ObjectId aId = blob("foo\nbar\nbaz\n");
 		ObjectId bId = blob("foo\nbar\nblah\n");
 		ObjectId aId = blob("foo\nbar\nbaz\nblarg\n");
 		ObjectId bId = blob("foo\nbar\nbaz\nblah\n");

 		DiffEntry a = DiffEntry.add(PATH_A, aId);
 		DiffEntry b = DiffEntry.delete(PATH_Q, bId);
@@ -135,12 +135,12 @@ public class RenameDetectorTest extends RepositoryTestCase {

 		List<DiffEntry> entries = rd.compute();
 		assertEquals(1, entries.size());
 		assertRename(b, a, 61, entries.get(0));
 		assertRename(b, a, 66, entries.get(0));
 	}

 	public void testInexactRename_OneRenameTwoUnrelatedFiles() throws Exception {
 		ObjectId aId = blob("foo\nbar\nbaz\n");
 		ObjectId bId = blob("foo\nbar\nblah\n");
 		ObjectId aId = blob("foo\nbar\nbaz\nblarg\n");
 		ObjectId bId = blob("foo\nbar\nbaz\nblah\n");
 		DiffEntry a = DiffEntry.add(PATH_A, aId);
 		DiffEntry b = DiffEntry.delete(PATH_Q, bId);

@@ -158,7 +158,7 @@ public class RenameDetectorTest extends RepositoryTestCase {
 		assertEquals(3, entries.size());
 		assertSame(c, entries.get(0));
 		assertSame(d, entries.get(1));
 		assertRename(b, a, 61, entries.get(2));
 		assertRename(b, a, 66, entries.get(2));
 	}

 	public void testInexactRename_LastByteDifferent() throws Exception {
--- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java
+++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java
@@ -78,8 +78,8 @@ public class SimilarityIndexTest extends TestCase {
 		assertEquals(8, src.common(dst));
 		assertEquals(8, dst.common(src));

 		assertEquals(100, src.score(dst));
 		assertEquals(100, dst.score(src));
 		assertEquals(100, src.score(dst, 100));
 		assertEquals(100, dst.score(src, 100));
 	}

 	public void testCommonScore_EmptyFiles() {
@@ -102,8 +102,8 @@ public class SimilarityIndexTest extends TestCase {
 		assertEquals(6, src.common(dst));
 		assertEquals(6, dst.common(src));

 		assertEquals(75, src.score(dst));
 		assertEquals(75, dst.score(src));
 		assertEquals(75, src.score(dst, 100));
 		assertEquals(75, dst.score(src, 100));
 	}

 	private static SimilarityIndex hash(String text) {
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java
@@ -142,11 +142,11 @@ class SimilarityIndex {
 		Arrays.sort(idHash);
 	}

 	int score(SimilarityIndex dst) {
 	/**
 	 * Score the similarity of this index against {@code dst}, scaled so
 	 * that a perfect match yields {@code maxScore}.
 	 *
 	 * @param dst
 	 *            the other index to compare against.
 	 * @param maxScore
 	 *            the value returned for a perfect match; the result lies in
 	 *            the range {@code 0..maxScore}.
 	 * @return {@code maxScore} when the larger of the two {@code fileSize}
 	 *         values is 0; otherwise {@code common(dst) * maxScore / max},
 	 *         truncated toward zero.
 	 */
 	int score(SimilarityIndex dst, int maxScore) {
 		long max = Math.max(fileSize, dst.fileSize);
 		if (max == 0)
 			return 100;
 		return (int) ((common(dst) * 100L) / max);
 			return maxScore;
 		// Widen before multiplying: common() returns an int, and with
 		// maxScore = 10000 an int product would overflow as soon as
 		// common(dst) exceeds 214,748. The previous code got the same
 		// protection from its 100L literal.
 		return (int) (((long) common(dst) * maxScore) / max);
 	}

 	int common(SimilarityIndex dst) {
--- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java
@@ -260,7 +260,14 @@ class SimilarityRenameDetector {
 				}

 				SimilarityIndex d = hash(dstEnt.newId.toObjectId());
 				int score = s.score(d);
 				int contentScore = s.score(d, 10000);

 				// nameScore returns a value between 0 and 100, but we want it
 				// to be in the same range as the content score. This allows it
 				// to be dropped into the pretty formula for the final score.
 				int nameScore = nameScore(srcEnt.oldName, dstEnt.newName) * 100;

 				int score = (contentScore * 99 + nameScore * 1) / 10000;

 				if (score < renameScore) {
 					pm.update(1);
@@ -280,6 +287,53 @@ class SimilarityRenameDetector {
 		return mNext;
 	}

 	/**
 	 * Score how similar two paths are, on a scale of 0 to 100.
 	 * <p>
 	 * Half of the score comes from the directory portion (everything up to
 	 * and including the last '/'). It is compared once from the left and
 	 * once from the right, each direction contributing a quarter of the
 	 * total. The other half comes from the file name, compared from the
 	 * right so that a shared suffix such as an extension counts as
 	 * similarity.
 	 * <p>
 	 * The method reads no instance state, so it is static and
 	 * package-visible to allow it to be exercised directly.
 	 *
 	 * @param a
 	 *            first path, using '/' as separator; must not be null.
 	 * @param b
 	 *            second path, using '/' as separator; must not be null.
 	 * @return a value between 0 and 100; identical paths score 100.
 	 */
 	static int nameScore(String a, String b) {
 		// Length of the directory part, including the trailing '/'.
 		// This is 0 when the path contains no '/' at all.
 		final int aDirLen = a.lastIndexOf('/') + 1;
 		final int bDirLen = b.lastIndexOf('/') + 1;

 		final int dirMin = Math.min(aDirLen, bDirLen);
 		final int dirMax = Math.max(aDirLen, bDirLen);

 		final int dirScoreLtr;
 		final int dirScoreRtl;

 		if (dirMax == 0) {
 			// Neither path has a directory: treat them as the same one.
 			dirScoreLtr = 100;
 			dirScoreRtl = 100;
 		} else {
 			// Length of the common directory prefix.
 			int dirSim = 0;
 			while (dirSim < dirMin && a.charAt(dirSim) == b.charAt(dirSim))
 				dirSim++;
 			dirScoreLtr = (dirSim * 100) / dirMax;

 			if (dirScoreLtr == 100) {
 				// Directories are identical; no need to scan backwards.
 				dirScoreRtl = 100;
 			} else {
 				// Length of the common directory suffix.
 				dirSim = 0;
 				while (dirSim < dirMin
 						&& a.charAt(aDirLen - 1 - dirSim) == b
 								.charAt(bDirLen - 1 - dirSim))
 					dirSim++;
 				dirScoreRtl = (dirSim * 100) / dirMax;
 			}
 		}

 		final int aFileLen = a.length() - aDirLen;
 		final int bFileLen = b.length() - bDirLen;
 		final int fileMin = Math.min(aFileLen, bFileLen);
 		final int fileMax = Math.max(aFileLen, bFileLen);

 		final int fileScore;
 		if (fileMax == 0) {
 			// Both file-name parts are empty (empty path, or path ending
 			// in '/'). Treat them as identical, mirroring the directory
 			// case above, rather than dividing by zero.
 			fileScore = 100;
 		} else {
 			// Length of the common file-name suffix.
 			int fileSim = 0;
 			while (fileSim < fileMin
 					&& a.charAt(a.length() - 1 - fileSim) == b.charAt(b
 							.length() - 1 - fileSim))
 				fileSim++;
 			fileScore = (fileSim * 100) / fileMax;
 		}

 		// 25% left-to-right directory, 25% right-to-left directory,
 		// 50% file name.
 		return (((dirScoreLtr + dirScoreRtl) * 25) + (fileScore * 50)) / 100;
 	}

 	private SimilarityIndex hash(ObjectId objectId) throws IOException {
 		SimilarityIndex r = new SimilarityIndex();
 		r.hash(repo.openObject(objectId));