diff options
author | Shawn O. Pearce <spearce@spearce.org> | 2010-07-02 17:29:09 -0700 |
---|---|---|
committer | Shawn O. Pearce <spearce@spearce.org> | 2010-07-03 16:32:03 -0700 |
commit | 978535b09080edcc01302e80b37b9e1263c21db3 (patch) | |
tree | 4925c496b30177b7a8e5103e71623f79f9cd2a0d /org.eclipse.jgit.test/tst/org/eclipse/jgit/diff | |
parent | cb8e1e6014c9cbac9c557df519ad7e22bdcf7d7d (diff) | |
download | jgit-978535b09080edcc01302e80b37b9e1263c21db3.tar.gz jgit-978535b09080edcc01302e80b37b9e1263c21db3.zip |
Implement similarity based rename detection
Content similarity based rename detection is performed only after
a linear time detection is performed using exact content match on
the ObjectIds. Any names which were paired up during that exact
match phase are excluded from the inexact similarity based rename,
which reduces the space that must be considered.
During rename detection two entries cannot be marked as a rename
if they are different types of files. This prevents a symlink from
being renamed to a regular file, even if their blob content appears
to be similar, or is identical.
Efficiently comparing two files is performed by building up two
hash indexes and hashing lines or short blocks from each file,
counting the number of bytes that each line or block represents.
Instead of using a standard java.util.HashMap, we use a custom
open hashing scheme similiar to what we use in ObjecIdSubclassMap.
This permits us to have a very light-weight hash, with very little
memory overhead per cell stored.
As we only need two ints per record in the map (line/block key and
number of bytes), we collapse them into a single long inside of
a long array, making very efficient use of available memory when
we create the index table. We only need object headers for the
index structure itself, and the index table, but not per-cell.
This offers a massive space savings over using java.util.HashMap.
The score calculation is done by approximating how many bytes are
the same between the two inputs (which for a delta would be how much
is copied from the base into the result). The score is derived by
dividing the approximate number of bytes in common into the length
of the larger of the two input files.
Right now the SimilarityIndex table should average about 1/2 full,
which means we waste about 50% of our memory on empty entries
after we are done indexing a file and sort the table's contents.
If memory becomes an issue we could discard the table and copy all
records over to a new array that is properly sized.
Building the index requires O(M + N log N) time, where M is the
size of the input file in bytes, and N is the number of unique
lines/blocks in the file. The N log N time constraint comes
from the sort of the index table that is necessary to perform
linear time matching against another SimilarityIndex created for
a different file.
To actually perform the rename detection, a SxD matrix is created,
placing the sources (aka deletions) along one dimension and the
destinations (aka additions) along the other. A simple O(S x D)
loop examines every cell in this matrix.
A SimilarityIndex is built along the row and reused for each
column compare along that row, avoiding the costly index rebuild
at the row level. A future improvement would be to load a smaller
square matrix into SimilarityIndexes and process everything in that
sub-matrix before discarding the column dimension and moving down
to the next sub-matrix block along that same grid of rows.
An optional ProgressMonitor is permitted to be passed in, allowing
applications to see the progress of the detector as it works through
the matrix cells. This provides some indication of current status
for very long running renames.
The default line/block hash function used by the SimilarityIndex
may not be optimal, and may produce too many collisions. It is
borrowed from RawText's hash, which is used to quickly skip out of
a longer equality test if two lines have different hash functions.
We may need to refine this hash in the future, in order to minimize
the number of collisions we get on common source files.
Based on a handful of test commits in JGit (especially my own
recent rename repository refactoring series), this rename detector
produces output that is very close to C Git. The content similarity
scores are sometimes off by 1%, which is most probably caused by
our SimilarityIndex type using a different hash function than C
Git uses when it computes the delta size between any two objects
in the rename matrix.
Bug: 318504
Change-Id: I11dff969e8a2e4cf252636d857d2113053bdd9dc
Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
Diffstat (limited to 'org.eclipse.jgit.test/tst/org/eclipse/jgit/diff')
-rw-r--r-- | org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java | 321 | ||||
-rw-r--r-- | org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java | 137 |
2 files changed, 340 insertions, 118 deletions
diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java index 4f84066f2e..c4cb600db0 100644 --- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java +++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java @@ -53,152 +53,237 @@ import org.eclipse.jgit.lib.ObjectId; import org.eclipse.jgit.lib.RepositoryTestCase; public class RenameDetectorTest extends RepositoryTestCase { + private static final String PATH_A = "src/A"; + private static final String PATH_H = "src/H"; + private static final String PATH_Q = "src/Q"; - RenameDetector rd; + private RenameDetector rd; - TestRepository testDb; + private TestRepository testDb; @Override public void setUp() throws Exception { super.setUp(); testDb = new TestRepository(db); - rd = new RenameDetector(); + rd = new RenameDetector(db); } - public void testGetEntriesAddDelete() throws Exception { - ObjectId foo = testDb.blob("foo").copy(); + public void testExactRename_OneRename() throws Exception { + ObjectId foo = blob("foo"); - DiffEntry a = new DiffEntry(); - a.newId = AbbreviatedObjectId.fromObjectId(foo); - a.newMode = FileMode.REGULAR_FILE; - a.newName = "some/file.c"; - a.changeType = ChangeType.ADD; + DiffEntry a = DiffEntry.add(PATH_A, foo); + DiffEntry b = DiffEntry.delete(PATH_Q, foo); - DiffEntry b = new DiffEntry(); - b.oldId = AbbreviatedObjectId.fromObjectId(foo); - b.oldMode = FileMode.REGULAR_FILE; - b.oldName = "some/other_file.c"; - b.changeType = ChangeType.DELETE; + rd.add(a); + rd.add(b); - rd.addDiffEntry(a); - rd.addDiffEntry(b); + List<DiffEntry> entries = rd.compute(); + assertEquals(1, entries.size()); + assertRename(b, a, 100, entries.get(0)); + } + + public void testExactRename_OneRenameOneModify() throws Exception { + ObjectId foo = blob("foo"); + ObjectId bar = blob("bar"); + + DiffEntry a = DiffEntry.add(PATH_A, foo); + DiffEntry b = DiffEntry.delete(PATH_Q, foo); + + DiffEntry c = DiffEntry.modify(PATH_H); + c.newId = c.oldId = AbbreviatedObjectId.fromObjectId(bar); + + rd.add(a); + rd.add(b); + rd.add(c); + + List<DiffEntry> entries = rd.compute(); + assertEquals(2, entries.size()); + assertRename(b, a, 100, entries.get(0)); + assertSame(c, entries.get(1)); + } + + public void testExactRename_ManyRenames() throws Exception { + ObjectId foo = blob("foo"); + ObjectId bar = blob("bar"); + + DiffEntry a = DiffEntry.add(PATH_A, foo); + DiffEntry b = DiffEntry.delete(PATH_Q, foo); + + DiffEntry c = DiffEntry.add("README", bar); + DiffEntry d = DiffEntry.delete("REEDME", bar); + + rd.add(a); + rd.add(b); + rd.add(c); + rd.add(d); + + List<DiffEntry> entries = rd.compute(); + assertEquals(2, entries.size()); + assertRename(d, c, 100, entries.get(0)); + assertRename(b, a, 100, entries.get(1)); + } + + public void testInexactRename_OnePair() throws Exception { + ObjectId aId = blob("foo\nbar\nbaz\n"); + ObjectId bId = blob("foo\nbar\nblah\n"); + + DiffEntry a = DiffEntry.add(PATH_A, aId); + DiffEntry b = DiffEntry.delete(PATH_Q, bId); - List<DiffEntry> entries = rd.getEntries(); + rd.add(a); + rd.add(b); + + List<DiffEntry> entries = rd.compute(); assertEquals(1, entries.size()); + assertRename(b, a, 61, entries.get(0)); + } + + public void testInexactRename_OneRenameTwoUnrelatedFiles() throws Exception { + ObjectId aId = blob("foo\nbar\nbaz\n"); + ObjectId bId = blob("foo\nbar\nblah\n"); + DiffEntry a = DiffEntry.add(PATH_A, aId); + DiffEntry b = DiffEntry.delete(PATH_Q, bId); + + ObjectId cId = blob("some\nsort\nof\ntext\n"); + ObjectId dId = blob("completely\nunrelated\ntext\n"); + DiffEntry c = DiffEntry.add("c", cId); + DiffEntry d = DiffEntry.delete("d", dId); - DiffEntry rename = entries.get(0); - assertNotNull(rename); - assertTrue(foo.equals(rename.newId.toObjectId())); - assertTrue(foo.equals(rename.oldId.toObjectId())); - assertEquals(FileMode.REGULAR_FILE, rename.newMode); - assertEquals(FileMode.REGULAR_FILE, rename.oldMode); - assertEquals(ChangeType.RENAME, rename.changeType); - assertEquals("some/file.c", rename.newName); - assertEquals("some/other_file.c", rename.oldName); + rd.add(a); + rd.add(b); + rd.add(c); + rd.add(d); + + List<DiffEntry> entries = rd.compute(); + assertEquals(3, entries.size()); + assertSame(c, entries.get(0)); + assertSame(d, entries.get(1)); + assertRename(b, a, 61, entries.get(2)); } - public void testGetEntriesAddDeleteModify() throws Exception { - ObjectId foo = testDb.blob("foo").copy(); - ObjectId bar = testDb.blob("bar").copy(); + public void testInexactRename_LastByteDifferent() throws Exception { + ObjectId aId = blob("foo\nbar\na"); + ObjectId bId = blob("foo\nbar\nb"); - DiffEntry a = new DiffEntry(); - a.newId = AbbreviatedObjectId.fromObjectId(foo); - a.newMode = FileMode.REGULAR_FILE; - a.newName = "some/file.c"; - a.changeType = ChangeType.ADD; + DiffEntry a = DiffEntry.add(PATH_A, aId); + DiffEntry b = DiffEntry.delete(PATH_Q, bId); - DiffEntry b = new DiffEntry(); - b.oldId = AbbreviatedObjectId.fromObjectId(foo); - b.oldMode = FileMode.REGULAR_FILE; - b.oldName = "some/other_file.c"; - b.changeType = ChangeType.DELETE; + rd.add(a); + rd.add(b); - DiffEntry c = new DiffEntry(); - c.newId = c.oldId = AbbreviatedObjectId.fromObjectId(bar); - c.newMode = c.oldMode = FileMode.REGULAR_FILE; - c.newName = c.oldName = "some/header.h"; - c.changeType = ChangeType.MODIFY; + List<DiffEntry> entries = rd.compute(); + assertEquals(1, entries.size()); + assertRename(b, a, 88, entries.get(0)); + } + + public void testInexactRenames_OnePair2() throws Exception { + ObjectId aId = blob("ab\nab\nab\nac\nad\nae\n"); + ObjectId bId = blob("ac\nab\nab\nab\naa\na0\na1\n"); + + DiffEntry a = DiffEntry.add(PATH_A, aId); + DiffEntry b = DiffEntry.delete(PATH_Q, bId); + + rd.add(a); + rd.add(b); + rd.setRenameScore(50); - rd.addDiffEntry(a); - rd.addDiffEntry(b); - rd.addDiffEntry(c); + List<DiffEntry> entries = rd.compute(); + assertEquals(1, entries.size()); + assertRename(b, a, 57, entries.get(0)); + } + + public void testNoRenames_SingleByteFiles() throws Exception { + ObjectId aId = blob("a"); + ObjectId bId = blob("b"); + + DiffEntry a = DiffEntry.add(PATH_A, aId); + DiffEntry b = DiffEntry.delete(PATH_Q, bId); + + rd.add(a); + rd.add(b); + + List<DiffEntry> entries = rd.compute(); + assertEquals(2, entries.size()); + assertSame(a, entries.get(0)); + assertSame(b, entries.get(1)); + } + + public void testNoRenames_EmptyFile1() throws Exception { + ObjectId aId = blob(""); + DiffEntry a = DiffEntry.add(PATH_A, aId); + + rd.add(a); + + List<DiffEntry> entries = rd.compute(); + assertEquals(1, entries.size()); + assertSame(a, entries.get(0)); + } + + public void testNoRenames_EmptyFile2() throws Exception { + ObjectId aId = blob(""); + ObjectId bId = blob("blah"); + + DiffEntry a = DiffEntry.add(PATH_A, aId); + DiffEntry b = DiffEntry.delete(PATH_Q, bId); - List<DiffEntry> entries = rd.getEntries(); + rd.add(a); + rd.add(b); + + List<DiffEntry> entries = rd.compute(); assertEquals(2, entries.size()); + assertSame(a, entries.get(0)); + assertSame(b, entries.get(1)); + } + + public void testNoRenames_SymlinkAndFile() throws Exception { + ObjectId aId = blob("src/dest"); + + DiffEntry a = DiffEntry.add(PATH_A, aId); + DiffEntry b = DiffEntry.delete(PATH_Q, aId); + b.oldMode = FileMode.SYMLINK; - // The renamed change should be first because the output should be - // sorted by newName - DiffEntry rename = entries.get(0); - assertNotNull(rename); - assertTrue(foo.equals(rename.newId.toObjectId())); - assertTrue(foo.equals(rename.oldId.toObjectId())); - assertEquals(FileMode.REGULAR_FILE, rename.newMode); - assertEquals(FileMode.REGULAR_FILE, rename.oldMode); - assertEquals(ChangeType.RENAME, rename.changeType); - assertEquals("some/file.c", rename.newName); - assertEquals("some/other_file.c", rename.oldName); - - DiffEntry modify = entries.get(1); - assertEquals(c, modify); + rd.add(a); + rd.add(b); + + List<DiffEntry> entries = rd.compute(); + assertEquals(2, entries.size()); + assertSame(a, entries.get(0)); + assertSame(b, entries.get(1)); } - public void testGetEntriesMultipleRenames() throws Exception { - ObjectId foo = testDb.blob("foo").copy(); - ObjectId bar = testDb.blob("bar").copy(); - - DiffEntry a = new DiffEntry(); - a.newId = AbbreviatedObjectId.fromObjectId(foo); - a.newMode = FileMode.REGULAR_FILE; - a.newName = "some/file.c"; - a.changeType = ChangeType.ADD; - - DiffEntry b = new DiffEntry(); - b.oldId = AbbreviatedObjectId.fromObjectId(foo); - b.oldMode = FileMode.REGULAR_FILE; - b.oldName = "some/other_file.c"; - b.changeType = ChangeType.DELETE; - - DiffEntry c = new DiffEntry(); - c.newId = AbbreviatedObjectId.fromObjectId(bar); - c.newMode = FileMode.REGULAR_FILE; - c.newName = "README"; - c.changeType = ChangeType.ADD; - - DiffEntry d = new DiffEntry(); - d.oldId = AbbreviatedObjectId.fromObjectId(bar); - d.oldMode = FileMode.REGULAR_FILE; - d.oldName = "REEDME"; - d.changeType = ChangeType.DELETE; - - rd.addDiffEntry(a); - rd.addDiffEntry(b); - rd.addDiffEntry(c); - rd.addDiffEntry(d); - - List<DiffEntry> entries = rd.getEntries(); + public void testNoRenames_GitlinkAndFile() throws Exception { + ObjectId aId = blob("src/dest"); + + DiffEntry a = DiffEntry.add(PATH_A, aId); + DiffEntry b = DiffEntry.delete(PATH_Q, aId); + b.oldMode = FileMode.GITLINK; + + rd.add(a); + rd.add(b); + + List<DiffEntry> entries = rd.compute(); assertEquals(2, entries.size()); + assertSame(a, entries.get(0)); + assertSame(b, entries.get(1)); + } - // The REEDME -> README renamed change should be first because the - // output should be sorted by newName - DiffEntry readme = entries.get(0); - assertNotNull(readme); - assertTrue(bar.equals(readme.newId.toObjectId())); - assertTrue(bar.equals(readme.oldId.toObjectId())); - assertEquals(FileMode.REGULAR_FILE, readme.newMode); - assertEquals(FileMode.REGULAR_FILE, readme.oldMode); - assertEquals(ChangeType.RENAME, readme.changeType); - assertEquals("README", readme.newName); - assertEquals("REEDME", readme.oldName); - - DiffEntry somefile = entries.get(1); - assertNotNull(somefile); - assertTrue(foo.equals(somefile.newId.toObjectId())); - assertTrue(foo.equals(somefile.oldId.toObjectId())); - assertEquals(FileMode.REGULAR_FILE, somefile.newMode); - assertEquals(FileMode.REGULAR_FILE, somefile.oldMode); - assertEquals(ChangeType.RENAME, somefile.changeType); - assertEquals("some/file.c", somefile.newName); - assertEquals("some/other_file.c", somefile.oldName); + private ObjectId blob(String content) throws Exception { + return testDb.blob(content).copy(); } + private static void assertRename(DiffEntry o, DiffEntry n, int score, + DiffEntry rename) { + assertEquals(ChangeType.RENAME, rename.getChangeType()); + + assertEquals(o.getOldName(), rename.getOldName()); + assertEquals(n.getNewName(), rename.getNewName()); + + assertEquals(o.getOldMode(), rename.getOldMode()); + assertEquals(n.getNewMode(), rename.getNewMode()); + + assertEquals(o.getOldId(), rename.getOldId()); + assertEquals(n.getNewId(), rename.getNewId()); + + assertEquals(score, rename.getScore()); + } } diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java new file mode 100644 index 0000000000..9ab745fac9 --- /dev/null +++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/SimilarityIndexTest.java @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2010, Google Inc. + * and other copyright owners as documented in the project's IP log. + * + * This program and the accompanying materials are made available + * under the terms of the Eclipse Distribution License v1.0 which + * accompanies this distribution, is reproduced below, and is + * available at http://www.eclipse.org/org/documents/edl-v10.php + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * - Neither the name of the Eclipse Foundation, Inc. nor the + * names of its contributors may be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.eclipse.jgit.diff; + +import junit.framework.TestCase; + +import org.eclipse.jgit.lib.Constants; + +public class SimilarityIndexTest extends TestCase { + public void testIndexing() { + SimilarityIndex si = hash("" // + + "A\n" // + + "B\n" // + + "D\n" // + + "B\n" // + ); + + int key_A = keyFor("A\n"); + int key_B = keyFor("B\n"); + int key_D = keyFor("D\n"); + assertTrue(key_A != key_B && key_A != key_D && key_B != key_D); + + assertEquals(3, si.size()); + assertEquals(2, si.count(si.findIndex(key_A))); + assertEquals(4, si.count(si.findIndex(key_B))); + assertEquals(2, si.count(si.findIndex(key_D))); + } + + public void testCommonScore_SameFiles() { + String text = "" // + + "A\n" // + + "B\n" // + + "D\n" // + + "B\n"; + SimilarityIndex src = hash(text); + SimilarityIndex dst = hash(text); + assertEquals(8, src.common(dst)); + assertEquals(8, dst.common(src)); + + assertEquals(100, src.score(dst)); + assertEquals(100, dst.score(src)); + } + + public void testCommonScore_EmptyFiles() { + SimilarityIndex src = hash(""); + SimilarityIndex dst = hash(""); + assertEquals(0, src.common(dst)); + assertEquals(0, dst.common(src)); + } + + public void testCommonScore_TotallyDifferentFiles() { + SimilarityIndex src = hash("A\n"); + SimilarityIndex dst = hash("D\n"); + assertEquals(0, src.common(dst)); + assertEquals(0, dst.common(src)); + } + + public void testCommonScore_SimiliarBy75() { + SimilarityIndex src = hash("A\nB\nC\nD\n"); + SimilarityIndex dst = hash("A\nB\nC\nQ\n"); + assertEquals(6, src.common(dst)); + assertEquals(6, dst.common(src)); + + assertEquals(75, src.score(dst)); + assertEquals(75, dst.score(src)); + } + + private static SimilarityIndex hash(String text) { + SimilarityIndex src = new SimilarityIndex() { + @Override + void hash(byte[] raw, int ptr, final int end) { + while (ptr < end) { + int hash = raw[ptr] & 0xff; + int start = ptr; + do { + int c = raw[ptr++] & 0xff; + if (c == '\n') + break; + } while (ptr < end && ptr - start < 64); + add(hash, ptr - start); + } + } + }; + byte[] raw = Constants.encode(text); + src.setFileSize(raw.length); + src.hash(raw, 0, raw.length); + src.sort(); + return src; + } + + private static int keyFor(String line) { + SimilarityIndex si = hash(line); + assertEquals("single line scored", 1, si.size()); + return si.key(0); + } +} |