import org.eclipse.jgit.lib.RepositoryTestCase;
public class RenameDetectorTest extends RepositoryTestCase {
+ private static final String PATH_A = "src/A";
+ private static final String PATH_H = "src/H";
+ private static final String PATH_Q = "src/Q";
- RenameDetector rd;
+ private RenameDetector rd;
- TestRepository testDb;
+ private TestRepository testDb;
@Override
public void setUp() throws Exception {
super.setUp();
testDb = new TestRepository(db);
- rd = new RenameDetector();
+ rd = new RenameDetector(db);
}
- public void testGetEntriesAddDelete() throws Exception {
- ObjectId foo = testDb.blob("foo").copy();
+ /** An add and a delete of the same blob must collapse into one rename scored 100. */
+ public void testExactRename_OneRename() throws Exception {
+ ObjectId content = blob("foo");
- DiffEntry a = new DiffEntry();
- a.newId = AbbreviatedObjectId.fromObjectId(foo);
- a.newMode = FileMode.REGULAR_FILE;
- a.newName = "some/file.c";
- a.changeType = ChangeType.ADD;
+ DiffEntry added = DiffEntry.add(PATH_A, content);
+ DiffEntry removed = DiffEntry.delete(PATH_Q, content);
- DiffEntry b = new DiffEntry();
- b.oldId = AbbreviatedObjectId.fromObjectId(foo);
- b.oldMode = FileMode.REGULAR_FILE;
- b.oldName = "some/other_file.c";
- b.changeType = ChangeType.DELETE;
+ rd.add(added);
+ rd.add(removed);
- rd.addDiffEntry(a);
- rd.addDiffEntry(b);
+ List<DiffEntry> result = rd.compute();
+ assertEquals(1, result.size());
+ assertRename(removed, added, 100, result.get(0));
+ }
+
+ /** An exact rename is reported first, followed by the very same modify entry that was added. */
+ public void testExactRename_OneRenameOneModify() throws Exception {
+ ObjectId renamedBlob = blob("foo");
+ ObjectId modifiedBlob = blob("bar");
+
+ DiffEntry added = DiffEntry.add(PATH_A, renamedBlob);
+ DiffEntry removed = DiffEntry.delete(PATH_Q, renamedBlob);
+
+ // The modify entry keeps identical old and new ids.
+ DiffEntry modified = DiffEntry.modify(PATH_H);
+ AbbreviatedObjectId modifiedId = AbbreviatedObjectId.fromObjectId(modifiedBlob);
+ modified.oldId = modifiedId;
+ modified.newId = modifiedId;
+
+ rd.add(added);
+ rd.add(removed);
+ rd.add(modified);
+
+ List<DiffEntry> result = rd.compute();
+ assertEquals(2, result.size());
+ assertRename(removed, added, 100, result.get(0));
+ assertSame(modified, result.get(1));
+ }
+
+ /** Two independent exact renames are both detected; README is expected ahead of src/A. */
+ public void testExactRename_ManyRenames() throws Exception {
+ ObjectId fooBlob = blob("foo");
+ ObjectId barBlob = blob("bar");
+
+ DiffEntry srcAdd = DiffEntry.add(PATH_A, fooBlob);
+ DiffEntry srcDel = DiffEntry.delete(PATH_Q, fooBlob);
+
+ DiffEntry readmeAdd = DiffEntry.add("README", barBlob);
+ DiffEntry readmeDel = DiffEntry.delete("REEDME", barBlob);
+
+ rd.add(srcAdd);
+ rd.add(srcDel);
+ rd.add(readmeAdd);
+ rd.add(readmeDel);
+
+ List<DiffEntry> result = rd.compute();
+ assertEquals(2, result.size());
+ assertRename(readmeDel, readmeAdd, 100, result.get(0));
+ assertRename(srcDel, srcAdd, 100, result.get(1));
+ }
+
+ /** Two files differing only in their last line pair up as a rename scored 61. */
+ public void testInexactRename_OnePair() throws Exception {
+ ObjectId addedBlob = blob("foo\nbar\nbaz\n");
+ ObjectId removedBlob = blob("foo\nbar\nblah\n");
+
+ DiffEntry added = DiffEntry.add(PATH_A, addedBlob);
+ DiffEntry removed = DiffEntry.delete(PATH_Q, removedBlob);
- List<DiffEntry> entries = rd.getEntries();
+ rd.add(added);
+ rd.add(removed);
+
+ List<DiffEntry> entries = rd.compute();
assertEquals(1, entries.size());
+ assertRename(removed, added, 61, entries.get(0));
+ }
+
+ /** Unrelated add/delete entries pass through untouched while the similar pair becomes a rename. */
+ public void testInexactRename_OneRenameTwoUnrelatedFiles() throws Exception {
+ ObjectId similarNew = blob("foo\nbar\nbaz\n");
+ ObjectId similarOld = blob("foo\nbar\nblah\n");
+ DiffEntry similarAdd = DiffEntry.add(PATH_A, similarNew);
+ DiffEntry similarDel = DiffEntry.delete(PATH_Q, similarOld);
+
+ ObjectId unrelatedNew = blob("some\nsort\nof\ntext\n");
+ ObjectId unrelatedOld = blob("completely\nunrelated\ntext\n");
+ DiffEntry unrelatedAdd = DiffEntry.add("c", unrelatedNew);
+ DiffEntry unrelatedDel = DiffEntry.delete("d", unrelatedOld);
- DiffEntry rename = entries.get(0);
- assertNotNull(rename);
- assertTrue(foo.equals(rename.newId.toObjectId()));
- assertTrue(foo.equals(rename.oldId.toObjectId()));
- assertEquals(FileMode.REGULAR_FILE, rename.newMode);
- assertEquals(FileMode.REGULAR_FILE, rename.oldMode);
- assertEquals(ChangeType.RENAME, rename.changeType);
- assertEquals("some/file.c", rename.newName);
- assertEquals("some/other_file.c", rename.oldName);
+ rd.add(similarAdd);
+ rd.add(similarDel);
+ rd.add(unrelatedAdd);
+ rd.add(unrelatedDel);
+
+ List<DiffEntry> result = rd.compute();
+ assertEquals(3, result.size());
+ assertSame(unrelatedAdd, result.get(0));
+ assertSame(unrelatedDel, result.get(1));
+ assertRename(similarDel, similarAdd, 61, result.get(2));
}
- public void testGetEntriesAddDeleteModify() throws Exception {
- ObjectId foo = testDb.blob("foo").copy();
- ObjectId bar = testDb.blob("bar").copy();
+ /** Files that differ only in their final byte are paired as a rename scored 88. */
+ public void testInexactRename_LastByteDifferent() throws Exception {
+ ObjectId addedBlob = blob("foo\nbar\na");
+ ObjectId removedBlob = blob("foo\nbar\nb");
- DiffEntry a = new DiffEntry();
- a.newId = AbbreviatedObjectId.fromObjectId(foo);
- a.newMode = FileMode.REGULAR_FILE;
- a.newName = "some/file.c";
- a.changeType = ChangeType.ADD;
+ DiffEntry added = DiffEntry.add(PATH_A, addedBlob);
+ DiffEntry removed = DiffEntry.delete(PATH_Q, removedBlob);
- DiffEntry b = new DiffEntry();
- b.oldId = AbbreviatedObjectId.fromObjectId(foo);
- b.oldMode = FileMode.REGULAR_FILE;
- b.oldName = "some/other_file.c";
- b.changeType = ChangeType.DELETE;
+ rd.add(added);
+ rd.add(removed);
- DiffEntry c = new DiffEntry();
- c.newId = c.oldId = AbbreviatedObjectId.fromObjectId(bar);
- c.newMode = c.oldMode = FileMode.REGULAR_FILE;
- c.newName = c.oldName = "some/header.h";
- c.changeType = ChangeType.MODIFY;
+ List<DiffEntry> result = rd.compute();
+ assertEquals(1, result.size());
+ assertRename(removed, added, 88, result.get(0));
+ }
+
+ /** With the threshold lowered to 50, a pair scoring 57 is accepted as a rename. */
+ public void testInexactRenames_OnePair2() throws Exception {
+ ObjectId addedBlob = blob("ab\nab\nab\nac\nad\nae\n");
+ ObjectId removedBlob = blob("ac\nab\nab\nab\naa\na0\na1\n");
+
+ DiffEntry added = DiffEntry.add(PATH_A, addedBlob);
+ DiffEntry removed = DiffEntry.delete(PATH_Q, removedBlob);
+
+ rd.add(added);
+ rd.add(removed);
+ rd.setRenameScore(50);
- rd.addDiffEntry(a);
- rd.addDiffEntry(b);
- rd.addDiffEntry(c);
+ List<DiffEntry> result = rd.compute();
+ assertEquals(1, result.size());
+ assertRename(removed, added, 57, result.get(0));
+ }
+
+ /** Two one-byte files with different content are not paired; both entries come back as-is. */
+ public void testNoRenames_SingleByteFiles() throws Exception {
+ ObjectId addedBlob = blob("a");
+ ObjectId removedBlob = blob("b");
+
+ DiffEntry added = DiffEntry.add(PATH_A, addedBlob);
+ DiffEntry removed = DiffEntry.delete(PATH_Q, removedBlob);
+
+ rd.add(added);
+ rd.add(removed);
+
+ List<DiffEntry> result = rd.compute();
+ assertEquals(2, result.size());
+ assertSame(added, result.get(0));
+ assertSame(removed, result.get(1));
+ }
+
+ /** A lone add of an empty blob is returned unchanged. */
+ public void testNoRenames_EmptyFile1() throws Exception {
+ ObjectId emptyBlob = blob("");
+ DiffEntry added = DiffEntry.add(PATH_A, emptyBlob);
+
+ rd.add(added);
+
+ List<DiffEntry> result = rd.compute();
+ assertEquals(1, result.size());
+ assertSame(added, result.get(0));
+ }
+
+ /** An added empty file is not matched against a deleted non-empty file. */
+ public void testNoRenames_EmptyFile2() throws Exception {
+ ObjectId emptyBlob = blob("");
+ ObjectId removedBlob = blob("blah");
+
+ DiffEntry added = DiffEntry.add(PATH_A, emptyBlob);
+ DiffEntry removed = DiffEntry.delete(PATH_Q, removedBlob);
- List<DiffEntry> entries = rd.getEntries();
+ rd.add(added);
+ rd.add(removed);
+
+ List<DiffEntry> entries = rd.compute();
assertEquals(2, entries.size());
+ assertSame(added, entries.get(0));
+ assertSame(removed, entries.get(1));
+ }
+
+ /** A deleted symlink and an added regular file are not paired even though their ids match. */
+ public void testNoRenames_SymlinkAndFile() throws Exception {
+ ObjectId sharedBlob = blob("src/dest");
+
+ DiffEntry added = DiffEntry.add(PATH_A, sharedBlob);
+ DiffEntry removed = DiffEntry.delete(PATH_Q, sharedBlob);
+ removed.oldMode = FileMode.SYMLINK;
- // The renamed change should be first because the output should be
- // sorted by newName
- DiffEntry rename = entries.get(0);
- assertNotNull(rename);
- assertTrue(foo.equals(rename.newId.toObjectId()));
- assertTrue(foo.equals(rename.oldId.toObjectId()));
- assertEquals(FileMode.REGULAR_FILE, rename.newMode);
- assertEquals(FileMode.REGULAR_FILE, rename.oldMode);
- assertEquals(ChangeType.RENAME, rename.changeType);
- assertEquals("some/file.c", rename.newName);
- assertEquals("some/other_file.c", rename.oldName);
-
- DiffEntry modify = entries.get(1);
- assertEquals(c, modify);
+ rd.add(added);
+ rd.add(removed);
+
+ List<DiffEntry> result = rd.compute();
+ assertEquals(2, result.size());
+ assertSame(added, result.get(0));
+ assertSame(removed, result.get(1));
}
- public void testGetEntriesMultipleRenames() throws Exception {
- ObjectId foo = testDb.blob("foo").copy();
- ObjectId bar = testDb.blob("bar").copy();
-
- DiffEntry a = new DiffEntry();
- a.newId = AbbreviatedObjectId.fromObjectId(foo);
- a.newMode = FileMode.REGULAR_FILE;
- a.newName = "some/file.c";
- a.changeType = ChangeType.ADD;
-
- DiffEntry b = new DiffEntry();
- b.oldId = AbbreviatedObjectId.fromObjectId(foo);
- b.oldMode = FileMode.REGULAR_FILE;
- b.oldName = "some/other_file.c";
- b.changeType = ChangeType.DELETE;
-
- DiffEntry c = new DiffEntry();
- c.newId = AbbreviatedObjectId.fromObjectId(bar);
- c.newMode = FileMode.REGULAR_FILE;
- c.newName = "README";
- c.changeType = ChangeType.ADD;
-
- DiffEntry d = new DiffEntry();
- d.oldId = AbbreviatedObjectId.fromObjectId(bar);
- d.oldMode = FileMode.REGULAR_FILE;
- d.oldName = "REEDME";
- d.changeType = ChangeType.DELETE;
-
- rd.addDiffEntry(a);
- rd.addDiffEntry(b);
- rd.addDiffEntry(c);
- rd.addDiffEntry(d);
-
- List<DiffEntry> entries = rd.getEntries();
+ /** A deleted gitlink and an added regular file are not paired even though their ids match. */
+ public void testNoRenames_GitlinkAndFile() throws Exception {
+ ObjectId sharedBlob = blob("src/dest");
+
+ DiffEntry added = DiffEntry.add(PATH_A, sharedBlob);
+ DiffEntry removed = DiffEntry.delete(PATH_Q, sharedBlob);
+ removed.oldMode = FileMode.GITLINK;
+
+ rd.add(added);
+ rd.add(removed);
+
+ List<DiffEntry> entries = rd.compute();
assertEquals(2, entries.size());
+ assertSame(added, entries.get(0));
+ assertSame(removed, entries.get(1));
+ }
- // The REEDME -> README renamed change should be first because the
- // output should be sorted by newName
- DiffEntry readme = entries.get(0);
- assertNotNull(readme);
- assertTrue(bar.equals(readme.newId.toObjectId()));
- assertTrue(bar.equals(readme.oldId.toObjectId()));
- assertEquals(FileMode.REGULAR_FILE, readme.newMode);
- assertEquals(FileMode.REGULAR_FILE, readme.oldMode);
- assertEquals(ChangeType.RENAME, readme.changeType);
- assertEquals("README", readme.newName);
- assertEquals("REEDME", readme.oldName);
-
- DiffEntry somefile = entries.get(1);
- assertNotNull(somefile);
- assertTrue(foo.equals(somefile.newId.toObjectId()));
- assertTrue(foo.equals(somefile.oldId.toObjectId()));
- assertEquals(FileMode.REGULAR_FILE, somefile.newMode);
- assertEquals(FileMode.REGULAR_FILE, somefile.oldMode);
- assertEquals(ChangeType.RENAME, somefile.changeType);
- assertEquals("some/file.c", somefile.newName);
- assertEquals("some/other_file.c", somefile.oldName);
+ /** Creates a blob for {@code content} through {@code testDb} and returns a copy of the resulting id. */
+ private ObjectId blob(String content) throws Exception {
+ ObjectId stored = testDb.blob(content).copy();
+ return stored;
}
+ /**
+ * Assert that {@code rename} is a RENAME whose old side matches {@code o},
+ * whose new side matches {@code n}, and whose score equals {@code score}.
+ */
+ private static void assertRename(DiffEntry o, DiffEntry n, int score,
+ DiffEntry rename) {
+ ChangeType actualType = rename.getChangeType();
+ assertEquals(ChangeType.RENAME, actualType);
+
+ // Paths: old side from the delete, new side from the add.
+ assertEquals(o.getOldName(), rename.getOldName());
+ assertEquals(n.getNewName(), rename.getNewName());
+
+ // File modes.
+ assertEquals(o.getOldMode(), rename.getOldMode());
+ assertEquals(n.getNewMode(), rename.getNewMode());
+
+ // Object ids.
+ assertEquals(o.getOldId(), rename.getOldId());
+ assertEquals(n.getNewId(), rename.getNewId());
+
+ int actualScore = rename.getScore();
+ assertEquals(score, actualScore);
+ }
}
--- /dev/null
+/*
+ * Copyright (C) 2010, Google Inc.
+ * and other copyright owners as documented in the project's IP log.
+ *
+ * This program and the accompanying materials are made available
+ * under the terms of the Eclipse Distribution License v1.0 which
+ * accompanies this distribution, is reproduced below, and is
+ * available at http://www.eclipse.org/org/documents/edl-v10.php
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * - Neither the name of the Eclipse Foundation, Inc. nor the
+ * names of its contributors may be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.eclipse.jgit.diff;
+
+import junit.framework.TestCase;
+
+import org.eclipse.jgit.lib.Constants;
+
+/** Exercises {@link SimilarityIndex} using a simplified, line-oriented hash function. */
+public class SimilarityIndexTest extends TestCase {
+ /** Each distinct line yields one entry; repeated lines accumulate their byte counts. */
+ public void testIndexing() {
+ SimilarityIndex index = hash("A\nB\nD\nB\n");
+
+ int keyA = keyFor("A\n");
+ int keyB = keyFor("B\n");
+ int keyD = keyFor("D\n");
+ assertTrue(keyA != keyB && keyA != keyD && keyB != keyD);
+
+ assertEquals(3, index.size());
+ assertEquals(2, index.count(index.findIndex(keyA)));
+ assertEquals(4, index.count(index.findIndex(keyB)));
+ assertEquals(2, index.count(index.findIndex(keyD)));
+ }
+
+ /** Identical inputs share every byte and score 100 in both directions. */
+ public void testCommonScore_SameFiles() {
+ String content = "A\nB\nD\nB\n";
+ SimilarityIndex left = hash(content);
+ SimilarityIndex right = hash(content);
+ assertEquals(8, left.common(right));
+ assertEquals(8, right.common(left));
+
+ assertEquals(100, left.score(right));
+ assertEquals(100, right.score(left));
+ }
+
+ /** Two empty inputs have nothing in common. */
+ public void testCommonScore_EmptyFiles() {
+ SimilarityIndex left = hash("");
+ SimilarityIndex right = hash("");
+ assertEquals(0, left.common(right));
+ assertEquals(0, right.common(left));
+ }
+
+ /** Inputs made of different lines have nothing in common. */
+ public void testCommonScore_TotallyDifferentFiles() {
+ SimilarityIndex left = hash("A\n");
+ SimilarityIndex right = hash("D\n");
+ assertEquals(0, left.common(right));
+ assertEquals(0, right.common(left));
+ }
+
+ /** Three of four lines shared: 6 common bytes and a score of 75. */
+ public void testCommonScore_SimiliarBy75() {
+ SimilarityIndex left = hash("A\nB\nC\nD\n");
+ SimilarityIndex right = hash("A\nB\nC\nQ\n");
+ assertEquals(6, left.common(right));
+ assertEquals(6, right.common(left));
+
+ assertEquals(75, left.score(right));
+ assertEquals(75, right.score(left));
+ }
+
+ /**
+ * Build a sorted index over {@code text} with a test-only hash: each
+ * line (or run of at most 64 bytes) is keyed by its first byte and
+ * weighted by its length.
+ */
+ private static SimilarityIndex hash(String text) {
+ SimilarityIndex index = new SimilarityIndex() {
+ @Override
+ void hash(byte[] raw, int ptr, final int end) {
+ while (ptr < end) {
+ final int lineStart = ptr;
+ final int key = raw[lineStart] & 0xff;
+ // Consume through the newline, the end of input, or 64 bytes,
+ // whichever comes first; at least one byte is always consumed.
+ for (;;) {
+ boolean newline = (raw[ptr++] & 0xff) == '\n';
+ if (newline || ptr >= end || ptr - lineStart >= 64)
+ break;
+ }
+ add(key, ptr - lineStart);
+ }
+ }
+ };
+ byte[] encoded = Constants.encode(text);
+ index.setFileSize(encoded.length);
+ index.hash(encoded, 0, encoded.length);
+ index.sort();
+ return index;
+ }
+
+ /** Return the key the index assigns to a single line. */
+ private static int keyFor(String line) {
+ SimilarityIndex single = hash(line);
+ assertEquals("single line scored", 1, single.size());
+ return single.key(0);
+ }
+}
import junit.framework.TestCase;
+import org.eclipse.jgit.diff.DiffEntry;
import org.eclipse.jgit.lib.Constants;
import org.eclipse.jgit.lib.FileMode;
import org.eclipse.jgit.lib.ObjectId;
assertParse(fh);
assertEquals("/dev/null", fh.getOldName());
- assertSame(FileHeader.DEV_NULL, fh.getOldName());
+ assertSame(DiffEntry.DEV_NULL, fh.getOldName());
assertEquals("\u00c5ngstr\u00f6m", fh.getNewName());
assertSame(FileHeader.ChangeType.ADD, fh.getChangeType());
assertEquals("\u00c5ngstr\u00f6m", fh.getOldName());
assertEquals("/dev/null", fh.getNewName());
- assertSame(FileHeader.DEV_NULL, fh.getNewName());
+ assertSame(DiffEntry.DEV_NULL, fh.getNewName());
assertSame(FileHeader.ChangeType.DELETE, fh.getChangeType());
assertSame(FileHeader.PatchType.UNIFIED, fh.getPatchType());
import java.io.IOException;
import java.io.InputStream;
-import org.eclipse.jgit.lib.FileMode;
-
import junit.framework.TestCase;
+import org.eclipse.jgit.diff.DiffEntry;
+import org.eclipse.jgit.lib.FileMode;
+
public class PatchCcTest extends TestCase {
public void testParse_OneFileCc() throws IOException {
final Patch p = parseTestPatchFile();
final CombinedFileHeader cfh = (CombinedFileHeader) p.getFiles().get(0);
- assertSame(FileHeader.DEV_NULL, cfh.getOldName());
+ assertSame(DiffEntry.DEV_NULL, cfh.getOldName());
assertEquals("d", cfh.getNewName());
assertEquals(187, cfh.startOffset);
final CombinedFileHeader cfh = (CombinedFileHeader) p.getFiles().get(0);
assertEquals("a", cfh.getOldName());
- assertSame(FileHeader.DEV_NULL, cfh.getNewName());
+ assertSame(DiffEntry.DEV_NULL, cfh.getNewName());
assertEquals(187, cfh.startOffset);
remoteHungUpUnexpectedly=remote hung up unexpectedly
remoteNameCantBeNull=Remote name can't be null.
renamesAlreadyFound=Renames have already been found.
+renamesFindingByContent=Finding renames by content similarity
+renamesFindingExact=Finding exact renames
repositoryAlreadyExists=Repository already exists: {0}
repositoryNotFound=repository not found: {0}
requiredHashFunctionNotAvailable=Required hash function {0} not available.
shortReadOfBlock=Short read of block.
shortReadOfOptionalDIRCExtensionExpectedAnotherBytes=Short read of optional DIRC extension {0}; expected another {1} bytes within the section.
shortSkipOfBlock=Short skip of block.
+similarityScoreMustBeWithinBounds=Similarity score must be between 0 and 100.
smartHTTPPushDisabled=smart HTTP push disabled
sourceDestinationMustMatch=Source/Destination must match.
sourceIsNotAWildcard=Source is not a wildcard.
/***/ public String remoteHungUpUnexpectedly;
/***/ public String remoteNameCantBeNull;
/***/ public String renamesAlreadyFound;
+ /***/ public String renamesFindingByContent;
+ /***/ public String renamesFindingExact;
/***/ public String repositoryAlreadyExists;
/***/ public String repositoryNotFound;
/***/ public String requiredHashFunctionNotAvailable;
/***/ public String shortReadOfBlock;
/***/ public String shortReadOfOptionalDIRCExtensionExpectedAnotherBytes;
/***/ public String shortSkipOfBlock;
+ /***/ public String similarityScoreMustBeWithinBounds;
/***/ public String smartHTTPPushDisabled;
/***/ public String sourceDestinationMustMatch;
/***/ public String sourceIsNotAWildcard;
package org.eclipse.jgit.diff;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
import org.eclipse.jgit.lib.AbbreviatedObjectId;
+import org.eclipse.jgit.lib.AnyObjectId;
import org.eclipse.jgit.lib.FileMode;
+import org.eclipse.jgit.lib.MutableObjectId;
+import org.eclipse.jgit.lib.ObjectId;
+import org.eclipse.jgit.treewalk.TreeWalk;
/** A value class representing a change to a file */
public class DiffEntry {
+ private static final AbbreviatedObjectId A_ZERO = AbbreviatedObjectId
+ .fromObjectId(ObjectId.zeroId());
+
+ /** Magical file name used for file adds or deletes. */
+ public static final String DEV_NULL = "/dev/null";
/** General type of change a single file-level patch describes. */
public static enum ChangeType {
COPY;
}
+ /**
+ * Walk a two-tree TreeWalk and produce one or two DiffEntry headers per
+ * path it yields.
+ * <p>
+ * A path missing on the old side becomes an ADD, a path missing on the
+ * new side becomes a DELETE, and anything else is a MODIFY. A MODIFY
+ * whose old and new modes are not the same type is split into a DELETE
+ * followed by an ADD.
+ *
+ * @param walk
+ * the TreeWalk to walk through. Must have exactly two trees.
+ * @return headers describing the changed files.
+ * @throws IOException
+ * the repository cannot be accessed.
+ */
+ public static List<DiffEntry> scan(TreeWalk walk) throws IOException {
+ List<DiffEntry> result = new ArrayList<DiffEntry>();
+ MutableObjectId scratch = new MutableObjectId();
+ while (walk.next()) {
+ DiffEntry e = new DiffEntry();
+
+ // Tree 0 is the old side, tree 1 the new side.
+ walk.getObjectId(scratch, 0);
+ e.oldId = AbbreviatedObjectId.fromObjectId(scratch);
+
+ walk.getObjectId(scratch, 1);
+ e.newId = AbbreviatedObjectId.fromObjectId(scratch);
+
+ e.oldMode = walk.getFileMode(0);
+ e.newMode = walk.getFileMode(1);
+
+ String path = walk.getPathString();
+ e.oldName = path;
+ e.newName = path;
+
+ if (e.oldMode == FileMode.MISSING) {
+ e.oldName = DiffEntry.DEV_NULL;
+ e.changeType = ChangeType.ADD;
+ result.add(e);
+ continue;
+ }
+
+ if (e.newMode == FileMode.MISSING) {
+ e.newName = DiffEntry.DEV_NULL;
+ e.changeType = ChangeType.DELETE;
+ result.add(e);
+ continue;
+ }
+
+ e.changeType = ChangeType.MODIFY;
+ if (!RenameDetector.sameType(e.oldMode, e.newMode)) {
+ result.addAll(breakModify(e));
+ continue;
+ }
+ result.add(e);
+ }
+ return result;
+ }
+
+ /** Build an ADD entry for a regular file at {@code path} with content {@code id}; the old side is /dev/null. */
+ static DiffEntry add(String path, AnyObjectId id) {
+ DiffEntry created = new DiffEntry();
+ created.changeType = ChangeType.ADD;
+
+ // Old side: nothing existed.
+ created.oldName = DEV_NULL;
+ created.oldMode = FileMode.MISSING;
+ created.oldId = A_ZERO;
+
+ // New side: the added file.
+ created.newName = path;
+ created.newMode = FileMode.REGULAR_FILE;
+ created.newId = AbbreviatedObjectId.fromObjectId(id);
+ return created;
+ }
+
+ /** Build a DELETE entry for a regular file at {@code path} with content {@code id}; the new side is /dev/null. */
+ static DiffEntry delete(String path, AnyObjectId id) {
+ DiffEntry removed = new DiffEntry();
+ removed.changeType = ChangeType.DELETE;
+
+ // Old side: the file being removed.
+ removed.oldName = path;
+ removed.oldMode = FileMode.REGULAR_FILE;
+ removed.oldId = AbbreviatedObjectId.fromObjectId(id);
+
+ // New side: nothing remains.
+ removed.newName = DEV_NULL;
+ removed.newMode = FileMode.MISSING;
+ removed.newId = A_ZERO;
+ return removed;
+ }
+
+ /**
+ * Build a MODIFY entry for a regular file at {@code path}. The old and
+ * new ids are left unset; callers assign them.
+ */
+ static DiffEntry modify(String path) {
+ DiffEntry changed = new DiffEntry();
+ changed.changeType = ChangeType.MODIFY;
+
+ changed.oldName = path;
+ changed.oldMode = FileMode.REGULAR_FILE;
+
+ changed.newName = path;
+ changed.newMode = FileMode.REGULAR_FILE;
+ return changed;
+ }
+
+ /**
+ * Split {@code entry} into a DELETE of its old side followed by an ADD
+ * of its new side.
+ *
+ * @return a two element list: the delete, then the add.
+ */
+ static List<DiffEntry> breakModify(DiffEntry entry) {
+ DiffEntry removal = new DiffEntry();
+ removal.changeType = ChangeType.DELETE;
+ removal.oldName = entry.getOldName();
+ removal.oldMode = entry.getOldMode();
+ removal.oldId = entry.getOldId();
+ removal.newName = DiffEntry.DEV_NULL;
+ removal.newMode = FileMode.MISSING;
+ removal.newId = A_ZERO;
+
+ DiffEntry addition = new DiffEntry();
+ addition.changeType = ChangeType.ADD;
+ addition.oldName = DiffEntry.DEV_NULL;
+ addition.oldMode = FileMode.MISSING;
+ addition.oldId = A_ZERO;
+ addition.newName = entry.getNewName();
+ addition.newMode = entry.getNewMode();
+ addition.newId = entry.getNewId();
+
+ return Arrays.asList(removal, addition);
+ }
+
+ /**
+ * Combine the old side of {@code src} with the new side of {@code dst}
+ * into a fresh entry carrying the given change type and score.
+ */
+ static DiffEntry pair(ChangeType changeType, DiffEntry src, DiffEntry dst,
+ int score) {
+ DiffEntry paired = new DiffEntry();
+ paired.changeType = changeType;
+ paired.score = score;
+
+ // Old side comes from the source entry.
+ paired.oldName = src.oldName;
+ paired.oldMode = src.oldMode;
+ paired.oldId = src.oldId;
+
+ // New side comes from the destination entry.
+ paired.newName = dst.newName;
+ paired.newMode = dst.newMode;
+ paired.newId = dst.newId;
+
+ return paired;
+ }
+
+
/** File name of the old (pre-image). */
protected String oldName;
public AbbreviatedObjectId getNewId() {
return newId;
}
-
}
\ No newline at end of file
import org.eclipse.jgit.JGitText;
import org.eclipse.jgit.diff.DiffEntry.ChangeType;
-import org.eclipse.jgit.errors.CorruptObjectException;
-import org.eclipse.jgit.errors.IncorrectObjectTypeException;
-import org.eclipse.jgit.errors.MissingObjectException;
import org.eclipse.jgit.lib.AbbreviatedObjectId;
+import org.eclipse.jgit.lib.Config;
import org.eclipse.jgit.lib.FileMode;
-import org.eclipse.jgit.lib.MutableObjectId;
-import org.eclipse.jgit.treewalk.TreeWalk;
+import org.eclipse.jgit.lib.NullProgressMonitor;
+import org.eclipse.jgit.lib.ProgressMonitor;
+import org.eclipse.jgit.lib.Repository;
/** Detect and resolve object renames. */
public class RenameDetector {
-
private static final int EXACT_RENAME_SCORE = 100;
private static final Comparator<DiffEntry> DIFF_COMPARATOR = new Comparator<DiffEntry>() {
- public int compare(DiffEntry o1, DiffEntry o2) {
- return o1.newName.compareTo(o2.newName);
+ // Order by effective path first; break ties by change type rank.
+ public int compare(DiffEntry a, DiffEntry b) {
+ int byName = nameOf(a).compareTo(nameOf(b));
+ if (byName != 0)
+ return byName;
+ return sortOf(a.getChangeType()) - sortOf(b.getChangeType());
+ }
+
+ private String nameOf(DiffEntry ent) {
+ // A delete has /dev/null as its new name, so its old name is
+ // the only meaningful path to sort on. Every other change type
+ // sorts on the new name.
+ //
+ return ent.changeType == ChangeType.DELETE ? ent.oldName : ent.newName;
+ }
+
+ private int sortOf(ChangeType changeType) {
+ // Deletes rank ahead of adds so that a type change on one path
+ // (for example symlink to regular file) removes the path before
+ // adding it back with the new type.
+ //
+ switch (changeType) {
+ case DELETE:
+ return 1;
+ case ADD:
+ return 2;
+ default:
+ return 10;
+ }
}
};
private List<DiffEntry> added = new ArrayList<DiffEntry>();
- private boolean done = false;
+ private boolean done;
+
+ private final Repository repo;
+
+ /** Similarity score required to pair an add/delete as a rename. */
+ private int renameScore = 60;
+
+ /** Limit in the number of files to consider for renames. */
+ private int renameLimit;
+
+ /** Set if the number of adds or deletes was over the limit. */
+ private boolean overRenameLimit;
+
+ /**
+ * Create a new rename detector for the given repository.
+ * <p>
+ * The rename limit is initialized from the repository's
+ * {@code diff.renamelimit} setting, defaulting to 200.
+ *
+ * @param repo
+ * the repository to use for rename detection
+ */
+ public RenameDetector(Repository repo) {
+ this.repo = repo;
+ this.renameLimit = repo.getConfig().getInt("diff", "renamelimit", 200);
+ }
/**
- * Walk through a given tree walk with exactly two trees and add all
- * differing files to the list of object to run rename detection on.
+ * @return minimum score required to pair an add/delete as a rename. The
+ * score ranges are within the bounds of (0, 100).
+ */
+ public int getRenameScore() {
+ // Stays at the field default of 60 unless setRenameScore changed it.
+ return this.renameScore;
+ }
+
+ /**
+ * Set the minimum score required to pair an add/delete as a rename.
* <p>
- * The tree walk must have two trees attached to it, as well as a filter.
- * Calling this method after calling {@link #getEntries()} will result in an
- * {@link IllegalStateException}.
+ * When comparing two files together their score must be greater than or
+ * equal to the rename score for them to be considered a rename match. The
+ * score is computed based on content similarity, so a score of 60 implies
+ * that approximately 60% of the bytes in the files are identical.
*
- * @param walk
- * the TreeWalk to walk through. Must have exactly two trees.
+ * @param score
+ * new rename score, must be within (0, 100).
+ */
+ public void setRenameScore(int score) {
+ // Both bounds are inclusive: 0 and 100 are accepted.
+ boolean withinBounds = 0 <= score && score <= 100;
+ if (!withinBounds)
+ throw new IllegalArgumentException(
+ JGitText.get().similarityScoreMustBeWithinBounds);
+ this.renameScore = score;
+ }
+
+ /** @return maximum number of paths considered for inexact rename detection. */
+ public int getRenameLimit() {
+ return this.renameLimit;
+ }
+
+ /**
+ * Set the limit on the number of files to perform inexact rename detection.
+ * <p>
+ * Inexact detection builds a square matrix with the rename limit on each
+ * side and may compare every pair of files in it. With 1000 added and
+ * 1000 deleted files that is a 1000*1000 matrix and up to 1,000,000 file
+ * compares. A limit of 0 means no limit is applied.
+ *
+ * @param limit
+ * new file limit.
+ */
+ public void setRenameLimit(int limit) {
+ this.renameLimit = limit;
+ }
+
+ /**
+ * Check if the detector is over the rename limit.
+ * <p>
+ * This method can be invoked either before or after {@code compute} has
+ * been used to perform rename detection. Once detection has run, the
+ * recorded outcome of that run is returned.
+ *
+ * @return true if the detector has more file additions or removals than the
+ * rename limit is currently set to. In such configurations the
+ * detector will skip expensive computation.
+ */
+ public boolean isOverRenameLimit() {
+ if (done)
+ return overRenameLimit;
+
+ int limit = getRenameLimit();
+ if (limit == 0)
+ return false; // a zero limit never counts as exceeded
+ return limit < Math.max(added.size(), deleted.size());
+ }
+
+ /**
+ * Add entries to be considered for rename detection.
+ *
+ * @param entriesToAdd
+ * one or more entries to add.
* @throws IllegalStateException
- * the {@link #getEntries()} method has already been called for
- * this instance.
- * @throws MissingObjectException
- * {@link TreeWalk#isRecursive()} was enabled on the tree, a
- * subtree was found, but the subtree object does not exist in
- * this repository. The repository may be missing objects.
- * @throws IncorrectObjectTypeException
- * {@link TreeWalk#isRecursive()} was enabled on the tree, a
- * subtree was found, and the subtree id does not denote a tree,
- * but instead names some other non-tree type of object. The
- * repository may have data corruption.
- * @throws CorruptObjectException
- * the contents of a tree did not appear to be a tree. The
- * repository may have data corruption.
- * @throws IOException
- * a loose object or pack file could not be read.
+ * if {@code compute} was already invoked.
*/
- public void addTreeWalk(TreeWalk walk) throws MissingObjectException,
- IncorrectObjectTypeException, CorruptObjectException, IOException {
+ public void addAll(Collection<DiffEntry> entriesToAdd) {
if (done)
throw new IllegalStateException(JGitText.get().renamesAlreadyFound);
- MutableObjectId idBuf = new MutableObjectId();
- while (walk.next()) {
- DiffEntry entry = new DiffEntry();
- walk.getObjectId(idBuf, 0);
- entry.oldId = AbbreviatedObjectId.fromObjectId(idBuf);
- walk.getObjectId(idBuf, 1);
- entry.newId = AbbreviatedObjectId.fromObjectId(idBuf);
- entry.oldMode = walk.getFileMode(0);
- entry.newMode = walk.getFileMode(1);
- entry.newName = entry.oldName = walk.getPathString();
- if (entry.oldMode == FileMode.MISSING) {
- entry.changeType = ChangeType.ADD;
+
+ // Adds and deletes are rename candidates; everything else goes
+ // straight into the result list.
+ for (DiffEntry entry : entriesToAdd) {
+ switch (entry.getChangeType()) {
+ case ADD:
added.add(entry);
- } else if (entry.newMode == FileMode.MISSING) {
- entry.changeType = ChangeType.DELETE;
+ break;
+
+ case DELETE:
deleted.add(entry);
- } else {
- entry.changeType = ChangeType.MODIFY;
- entries.add(entry);
+ break;
+
+ case MODIFY:
+ // A modify that changes the file type is split into a
+ // delete followed by an add.
+ if (sameType(entry.getOldMode(), entry.getNewMode()))
+ entries.add(entry);
+ else
+ entries.addAll(DiffEntry.breakModify(entry));
+ break;
+
+ case COPY:
+ case RENAME:
+ default:
+ // Already resolved by the caller: record it in our own
+ // result list. The input collection must never be modified
+ // here; it is being iterated and may be immutable.
+ entries.add(entry);
}
}
}
/**
- * Add a DiffEntry to the list of items to run rename detection on. Calling
- * this method after calling {@link #getEntries()} will result in an
- * {@link IllegalStateException}.
+ * Add an entry to be considered for rename detection.
*
* @param entry
- * the {@link DiffEntry} to add
- *
+ * to add.
* @throws IllegalStateException
- * the {@link #getEntries()} method has already been called for
- * this instance
+ * if {@code compute} was already invoked.
*/
- public void addDiffEntry(DiffEntry entry) {
- if (done)
- throw new IllegalStateException(JGitText.get().renamesAlreadyFound);
- switch (entry.changeType) {
- case ADD:
- added.add(entry);
- break;
- case DELETE:
- deleted.add(entry);
- break;
- case COPY:
- case MODIFY:
- case RENAME:
- default:
- entries.add(entry);
- }
+ public void add(DiffEntry entry) {
+ // Delegate to addAll with a one element list.
+ List<DiffEntry> single = Collections.singletonList(entry);
+ addAll(single);
}
/**
- * Determines which files, if any, are renames, and returns an unmodifiable
- * list of {@link DiffEntry}s representing all files that have been changed
- * in some way. The list will contain all modified files first
+ * Detect renames in the current file set.
+ * <p>
+ * This convenience function runs without a progress monitor.
*
* @return an unmodifiable list of {@link DiffEntry}s representing all files
- * that have been changed
+ * that have been changed.
* @throws IOException
+ * file contents cannot be read from the repository.
*/
- public List<DiffEntry> getEntries() throws IOException {
+ public List<DiffEntry> compute() throws IOException {
+ // No progress reporting: hand the no-op monitor to the full variant.
+ ProgressMonitor silent = NullProgressMonitor.INSTANCE;
+ return compute(silent);
+ }
+
+ /**
+ * Detect renames in the current file set.
+ * <p>
+ * Detection runs only on the first call: exact matches first, then
+ * content similarity. Later calls return the already computed list.
+ *
+ * @param pm
+ * report progress during the detection phases. A null monitor
+ * is replaced with {@link NullProgressMonitor#INSTANCE}.
+ * @return an unmodifiable list of {@link DiffEntry}s representing all files
+ * that have been changed.
+ * @throws IOException
+ * file contents cannot be read from the repository.
+ */
+ public List<DiffEntry> compute(ProgressMonitor pm) throws IOException {
if (!done) {
done = true;
- findExactRenames();
+
+ ProgressMonitor monitor = pm != null ? pm : NullProgressMonitor.INSTANCE;
+ findExactRenames(monitor);
+ findContentRenames(monitor);
+
+ // Whatever was not paired is reported as a plain add or delete.
entries.addAll(added);
- entries.addAll(deleted);
added = null;
+
+ entries.addAll(deleted);
deleted = null;
+
Collections.sort(entries, DIFF_COMPARATOR);
}
return Collections.unmodifiableList(entries);
}
+ /**
+ * Pair the remaining adds and deletes by content similarity, unless the
+ * larger of the two lists exceeds a non-zero rename limit, in which
+ * case only {@code overRenameLimit} is set.
+ */
+ private void findContentRenames(ProgressMonitor pm) throws IOException {
+ int larger = Math.max(added.size(), deleted.size());
+ if (larger == 0)
+ return;
+
+ int limit = getRenameLimit();
+ if (limit != 0 && limit < larger) {
+ overRenameLimit = true;
+ return;
+ }
+
+ SimilarityRenameDetector similarity = new SimilarityRenameDetector(repo, deleted, added);
+ similarity.setRenameScore(getRenameScore());
+ similarity.compute(pm);
+
+ // Keep only what the similarity pass could not match.
+ deleted = similarity.getLeftOverSources();
+ added = similarity.getLeftOverDestinations();
+ entries.addAll(similarity.getMatches());
+ }
+
+
@SuppressWarnings("unchecked")
- private void findExactRenames() {
- HashMap<AbbreviatedObjectId, Object> map = new HashMap<AbbreviatedObjectId, Object>();
+ private void findExactRenames(ProgressMonitor pm) {
+ if (added.isEmpty() || deleted.isEmpty())
+ return;
+ pm.beginTask(JGitText.get().renamesFindingExact, //
+ added.size() + deleted.size());
+
+ HashMap<AbbreviatedObjectId, Object> map = new HashMap<AbbreviatedObjectId, Object>();
for (DiffEntry del : deleted) {
Object old = map.put(del.oldId, del);
- if (old != null) {
- if (old instanceof DiffEntry) {
- ArrayList<DiffEntry> tmp = new ArrayList<DiffEntry>(2);
- tmp.add((DiffEntry) old);
- tmp.add(del);
- map.put(del.oldId, tmp);
- } else {
- // Must be a list of DiffEntrys
- ((List) old).add(del);
- map.put(del.oldId, old);
- }
+ if (old instanceof DiffEntry) {
+ ArrayList<DiffEntry> list = new ArrayList<DiffEntry>(2);
+ list.add((DiffEntry) old);
+ list.add(del);
+ map.put(del.oldId, list);
+
+ } else if (old != null) {
+ // Must be a list of DiffEntries
+ ((List) old).add(del);
+ map.put(del.oldId, old);
}
+ pm.update(1);
}
- ArrayList<DiffEntry> tempAdded = new ArrayList<DiffEntry>(added.size());
+ ArrayList<DiffEntry> left = new ArrayList<DiffEntry>(added.size());
+ for (DiffEntry dst : added) {
+ Object del = map.get(dst.newId);
+ if (del instanceof DiffEntry) {
+ DiffEntry e = (DiffEntry) del;
+ if (sameType(e.oldMode, dst.newMode)) {
+ if (e.changeType == ChangeType.DELETE) {
+ e.changeType = ChangeType.RENAME;
+ entries.add(exactRename(e, dst));
+ } else {
+ entries.add(exactCopy(e, dst));
+ }
+ } else {
+ left.add(dst);
+ }
- for (DiffEntry add : added) {
- Object del = map.remove(add.newId);
- if (del != null) {
- if (del instanceof DiffEntry) {
- entries.add(resolveRename(add, (DiffEntry) del,
- EXACT_RENAME_SCORE));
+ } else if (del != null) {
+ List<DiffEntry> list = (List<DiffEntry>) del;
+ DiffEntry best = null;
+ for (DiffEntry e : list) {
+ if (best == null && sameType(e.oldMode, dst.newMode))
+ best = e;
+ }
+ if (best != null) {
+ if (best.changeType == ChangeType.DELETE) {
+ best.changeType = ChangeType.RENAME;
+ entries.add(exactRename(best, dst));
+ } else {
+ entries.add(exactCopy(best, dst));
+ }
} else {
- // Must be a list of DiffEntrys
- List<DiffEntry> tmp = (List<DiffEntry>) del;
- entries.add(resolveRename(add, tmp.remove(0),
- EXACT_RENAME_SCORE));
- if (!tmp.isEmpty())
- map.put(add.newId, del);
+ left.add(dst);
}
+
} else {
- tempAdded.add(add);
+ left.add(dst);
}
+ pm.update(1);
}
- added = tempAdded;
-
- Collection<Object> values = map.values();
- ArrayList<DiffEntry> tempDeleted = new ArrayList<DiffEntry>(values
- .size());
- for (Object o : values) {
- if (o instanceof DiffEntry)
- tempDeleted.add((DiffEntry) o);
- else
- tempDeleted.addAll((List<DiffEntry>) o);
+ added = left;
+
+ deleted = new ArrayList<DiffEntry>(map.size());
+ for (Object o : map.values()) {
+ if (o instanceof DiffEntry) {
+ DiffEntry e = (DiffEntry) o;
+ if (e.changeType == ChangeType.DELETE)
+ deleted.add(e);
+ } else {
+ List<DiffEntry> list = (List<DiffEntry>) o;
+ for (DiffEntry e : list) {
+ if (e.changeType == ChangeType.DELETE)
+ deleted.add(e);
+ }
+ }
}
- deleted = tempDeleted;
+ pm.endTask();
}
- private DiffEntry resolveRename(DiffEntry add, DiffEntry del, int score) {
- DiffEntry renamed = new DiffEntry();
+ /**
+ * Test whether two file modes describe the same kind of object.
+ * <p>
+ * Only the bits selected by {@link FileMode#TYPE_MASK} are compared. Files
+ * have to be of the same type in order to rename them; we would never want
+ * to rename a file to a gitlink, or a symlink to a file.
+ *
+ * @param a
+ * first mode.
+ * @param b
+ * second mode.
+ * @return true if both modes have identical type bits.
+ */
+ static boolean sameType(FileMode a, FileMode b) {
+ // XOR leaves a bit set only where the two modes differ, so the
+ // types match exactly when no type bit survives the mask.
+ return ((a.getBits() ^ b.getBits()) & FileMode.TYPE_MASK) == 0;
+ }
- renamed.oldId = del.oldId;
- renamed.oldMode = del.oldMode;
- renamed.oldName = del.oldName;
- renamed.newId = add.newId;
- renamed.newMode = add.newMode;
- renamed.newName = add.newName;
- renamed.changeType = ChangeType.RENAME;
- renamed.score = score;
+ private static DiffEntry exactRename(DiffEntry src, DiffEntry dst) {
+ return DiffEntry.pair(ChangeType.RENAME, src, dst, EXACT_RENAME_SCORE);
+ }
- return renamed;
+ private static DiffEntry exactCopy(DiffEntry src, DiffEntry dst) {
+ return DiffEntry.pair(ChangeType.COPY, src, dst, EXACT_RENAME_SCORE);
}
}
--- /dev/null
+/*
+ * Copyright (C) 2010, Google Inc.
+ * and other copyright owners as documented in the project's IP log.
+ *
+ * This program and the accompanying materials are made available
+ * under the terms of the Eclipse Distribution License v1.0 which
+ * accompanies this distribution, is reproduced below, and is
+ * available at http://www.eclipse.org/org/documents/edl-v10.php
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * - Neither the name of the Eclipse Foundation, Inc. nor the
+ * names of its contributors may be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.eclipse.jgit.diff;
+
+import java.util.Arrays;
+
+import org.eclipse.jgit.lib.ObjectLoader;
+
+/**
+ * Index structure of lines/blocks in one file.
+ * <p>
+ * This structure can be used to compute an approximation of the similarity
+ * between two files. The index is used by {@link SimilarityRenameDetector} to
+ * compute scores between files.
+ * <p>
+ * To save space in memory, this index uses a space efficient encoding which
+ * will not exceed 1 MiB per instance. The index starts out at a smaller size
+ * (closer to 2 KiB), but may grow as more distinct blocks within the scanned
+ * file are discovered.
+ */
+class SimilarityIndex {
+ /** The {@link #idHash} table stops growing at {@code 1 << MAX_HASH_BITS}. */
+ private static final int MAX_HASH_BITS = 17;
+
+ /** The {@link #idHash} table will not grow bigger than this, ever. */
+ private static final int MAX_HASH_SIZE = 1 << MAX_HASH_BITS;
+
+ /** Prime just before {@link #MAX_HASH_SIZE}. */
+ private static final int P = 131071;
+
+ /**
+ * Shift to apply before storing a key.
+ * <p>
+ * Within the 64 bit table record space, we leave the highest bit unset so
+ * all values are positive, and we need {@link #MAX_HASH_BITS} bits for the
+ * keys. The lower 32 bits are used to count bytes impacted.
+ */
+ private static final int KEY_SHIFT = 64 - 1 - MAX_HASH_BITS;
+
+ /** Total size of the file we hashed into the structure. */
+ private long fileSize;
+
+ /** Number of non-zero entries in {@link #idHash}. */
+ private int idSize;
+
+ /**
+ * Pairings of content keys and counters.
+ * <p>
+ * Slots in the table are actually two ints wedged into a single long. The
+ * upper {@link #MAX_HASH_BITS} bits store the content key, and the
+ * remaining lower bits store the number of bytes associated with that key.
+ * Empty slots are denoted by 0, a value a used slot can never hold because
+ * its count cannot be 0. Values can only be positive, which we enforce
+ * during key addition.
+ */
+ private long[] idHash;
+
+ SimilarityIndex() {
+ idHash = new long[256];
+ }
+
+ long getFileSize() {
+ return fileSize;
+ }
+
+ void setFileSize(long size) {
+ fileSize = size;
+ }
+
+ void hash(ObjectLoader obj) {
+ byte[] raw = obj.getCachedBytes();
+ setFileSize(raw.length);
+ hash(raw, 0, raw.length);
+ }
+
+ void hash(byte[] raw, int ptr, final int end) {
+ while (ptr < end) {
+ int hash = 5381;
+ int start = ptr;
+
+ // Hash one line, or one block, whichever occurs first.
+ do {
+ int c = raw[ptr++] & 0xff;
+ if (c == '\n')
+ break;
+ hash = (hash << 5) ^ c;
+ } while (ptr < end && ptr - start < 64);
+ add(hash, ptr - start);
+ }
+ }
+
+ /**
+ * Sort the internal table so it can be used for efficient scoring.
+ * <p>
+ * Once sorted, additional lines/blocks cannot be added to the index.
+ */
+ void sort() {
+ // Sort the array. All of the empty space will wind up at the front,
+ // because we forced all of the keys to always be positive. Later
+ // we only work with the back half of the array.
+ //
+ Arrays.sort(idHash);
+ }
+
+ /**
+ * Score the similarity of this file against another, as a percentage.
+ * <p>
+ * The score is the number of bytes the two indexes have in common,
+ * relative to the size of the larger file.
+ *
+ * @param dst
+ * index of the other file.
+ * @return the similarity percentage; 100 when both files are empty, since
+ * two empty files have identical content.
+ */
+ int score(SimilarityIndex dst) {
+ long max = Math.max(fileSize, dst.fileSize);
+ if (max == 0) {
+ // Both files are empty. Avoid dividing by zero and report them
+ // as a perfect match.
+ return 100;
+ }
+ return (int) ((common(dst) * 100L) / max);
+ }
+
+ int common(SimilarityIndex dst) {
+ return common(this, dst);
+ }
+
+ private static int common(SimilarityIndex src, SimilarityIndex dst) {
+ int srcIdx = src.packedIndex(0);
+ int dstIdx = dst.packedIndex(0);
+ long[] srcHash = src.idHash;
+ long[] dstHash = dst.idHash;
+ return common(srcHash, srcIdx, dstHash, dstIdx);
+ }
+
+ /**
+ * Sum the bytes shared by two sorted tables of packed key/count records.
+ * <p>
+ * Both arrays are walked in parallel in ascending key order, starting at
+ * the given offsets. For a key present in both tables only the smaller of
+ * the two counts is shared content; crediting the full destination count
+ * would let one matching line that repeats many times in a single file
+ * make two otherwise different files look alike.
+ *
+ * @param srcHash
+ * sorted table of the source file.
+ * @param srcIdx
+ * index of the first record to examine in {@code srcHash}.
+ * @param dstHash
+ * sorted table of the destination file.
+ * @param dstIdx
+ * index of the first record to examine in {@code dstHash}.
+ * @return number of bytes the two tables have in common.
+ */
+ private static int common(long[] srcHash, int srcIdx, //
+ long[] dstHash, int dstIdx) {
+ if (srcIdx == srcHash.length || dstIdx == dstHash.length)
+ return 0;
+
+ int common = 0;
+ int srcKey = keyOf(srcHash[srcIdx]);
+ int dstKey = keyOf(dstHash[dstIdx]);
+
+ for (;;) {
+ if (srcKey == dstKey) {
+ // Only the overlap is shared, so take the smaller count.
+ common += Math.min(countOf(srcHash[srcIdx]),
+ countOf(dstHash[dstIdx]));
+
+ if (++srcIdx == srcHash.length)
+ break;
+ srcKey = keyOf(srcHash[srcIdx]);
+
+ if (++dstIdx == dstHash.length)
+ break;
+ dstKey = keyOf(dstHash[dstIdx]);
+
+ } else if (srcKey < dstKey) {
+ // Regions of src which do not appear in dst.
+ if (++srcIdx == srcHash.length)
+ break;
+ srcKey = keyOf(srcHash[srcIdx]);
+
+ } else /* if (srcKey > dstKey) */{
+ // Regions of dst which do not appear in src.
+ if (++dstIdx == dstHash.length)
+ break;
+ dstKey = keyOf(dstHash[dstIdx]);
+ }
+ }
+
+ return common;
+ }
+
+ // Testing only
+ int size() {
+ return idSize;
+ }
+
+ // Testing only
+ int key(int idx) {
+ return keyOf(idHash[packedIndex(idx)]);
+ }
+
+ // Testing only
+ long count(int idx) {
+ return countOf(idHash[packedIndex(idx)]);
+ }
+
+ // Brute force approach only for testing.
+ int findIndex(int key) {
+ for (int i = 0; i < idSize; i++)
+ if (key(i) == key)
+ return i;
+ return -1;
+ }
+
+ private int packedIndex(int idx) {
+ return (idHash.length - idSize) + idx;
+ }
+
+ void add(int key, int cnt) {
+ key = hash(key);
+ int j = slot(key);
+ for (;;) {
+ long v = idHash[j];
+ if (v == 0) {
+ // Empty slot in the table, store here.
+ if (shouldGrow()) {
+ grow();
+ j = slot(key);
+ continue;
+ }
+ idHash[j] = (((long) key) << KEY_SHIFT) | cnt;
+ idSize++;
+ return;
+
+ } else if (keyOf(v) == key) {
+ // Same key, increment the counter.
+ idHash[j] = v + cnt;
+ return;
+
+ } else if (++j >= idHash.length) {
+ j = 0;
+ }
+ }
+ }
+
+ private static int hash(int key) {
+ // Make the key fit into our table. Since we have a maximum size
+ // that we cap the table at, all keys get squashed before going
+ // into the table. This prevents overflow.
+ //
+ return (key >>> 1) % P;
+ }
+
+ private int slot(int key) {
+ return key % idHash.length;
+ }
+
+ private boolean shouldGrow() {
+ int n = idHash.length;
+ return n < MAX_HASH_SIZE && n <= idSize * 2;
+ }
+
+ private void grow() {
+ long[] oldHash = idHash;
+ int oldSize = idHash.length;
+
+ idHash = new long[2 * oldSize];
+ for (int i = 0; i < oldSize; i++) {
+ long v = oldHash[i];
+ if (v != 0) {
+ int j = slot(keyOf(v));
+ while (idHash[j] != 0)
+ if (++j >= idHash.length)
+ j = 0;
+ idHash[j] = v;
+ }
+ }
+ }
+
+ private static int keyOf(long v) {
+ return (int) (v >>> KEY_SHIFT);
+ }
+
+ private static int countOf(long v) {
+ return (int) v;
+ }
+}
--- /dev/null
+/*
+ * Copyright (C) 2010, Google Inc.
+ * and other copyright owners as documented in the project's IP log.
+ *
+ * This program and the accompanying materials are made available
+ * under the terms of the Eclipse Distribution License v1.0 which
+ * accompanies this distribution, is reproduced below, and is
+ * available at http://www.eclipse.org/org/documents/edl-v10.php
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * - Neither the name of the Eclipse Foundation, Inc. nor the
+ * names of its contributors may be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.eclipse.jgit.diff;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.eclipse.jgit.JGitText;
+import org.eclipse.jgit.diff.DiffEntry.ChangeType;
+import org.eclipse.jgit.lib.FileMode;
+import org.eclipse.jgit.lib.NullProgressMonitor;
+import org.eclipse.jgit.lib.ObjectId;
+import org.eclipse.jgit.lib.ProgressMonitor;
+import org.eclipse.jgit.lib.Repository;
+
+class SimilarityRenameDetector {
+ /**
+ * Number of bits we need to express an index into src or dst list.
+ * <p>
+ * This must be 28, giving us a limit of 2^28 entries in either list, which
+ * is an insane limit of 268,435,456 file names being considered in a single
+ * rename pass. The other 8 bits are used to store the score, while staying
+ * under 128 so the long doesn't go negative.
+ */
+ private static final int BITS_PER_INDEX = 28;
+
+ private static final int INDEX_MASK = (1 << BITS_PER_INDEX) - 1;
+
+ private static final int SCORE_SHIFT = 2 * BITS_PER_INDEX;
+
+ private final Repository repo;
+
+ /**
+ * All sources to consider for copies or renames.
+ * <p>
+ * A source is typically a {@link ChangeType#DELETE} change, but could be
+ * another type when trying to perform copy detection concurrently with
+ * rename detection.
+ */
+ private List<DiffEntry> srcs;
+
+ /**
+ * All destinations to consider looking for a rename.
+ * <p>
+ * A destination is typically an {@link ChangeType#ADD}, as the name has
+ * just come into existence, and we want to discover where its initial
+ * content came from.
+ */
+ private List<DiffEntry> dsts;
+
+ /**
+ * Matrix of all examined file pairs, and their scores.
+ * <p>
+ * The upper 8 bits of each long store the score, but the score is bounded
+ * to be in the range [0, 128) so that the highest bit is never set, and all
+ * entries are therefore never negative.
+ * <p>
+ * List indexes to an element of {@link #srcs} and {@link #dsts} are encoded
+ * as the lower two groups of 28 bits, respectively, but the encoding is
+ * inverted, so that 0 is expressed as {@code (1 << 28) - 1}. This sorts
+ * lower list indices later in the matrix, giving precedence to files whose
+ * names sort earlier in the tree.
+ */
+ private long[] matrix;
+
+ /** Minimum score a pair must reach to be considered a rename. */
+ private int renameScore = 60;
+
+ private List<DiffEntry> out;
+
+ SimilarityRenameDetector(Repository repo, List<DiffEntry> srcs,
+ List<DiffEntry> dsts) {
+ this.repo = repo;
+ this.srcs = srcs;
+ this.dsts = dsts;
+ }
+
+ void setRenameScore(int score) {
+ renameScore = score;
+ }
+
+ void compute(ProgressMonitor pm) throws IOException {
+ if (pm == null)
+ pm = NullProgressMonitor.INSTANCE;
+
+ pm.beginTask(JGitText.get().renamesFindingByContent, //
+ 2 * srcs.size() * dsts.size());
+
+ int mNext = buildMatrix(pm);
+ out = new ArrayList<DiffEntry>(Math.min(mNext, dsts.size()));
+
+ // Match rename pairs on a first come, first serve basis until
+ // we have looked at everything that is above our minimum score.
+ //
+ for (--mNext; mNext >= 0; mNext--) {
+ long ent = matrix[mNext];
+ int sIdx = srcFile(ent);
+ int dIdx = dstFile(ent);
+ DiffEntry s = srcs.get(sIdx);
+ DiffEntry d = dsts.get(dIdx);
+
+ if (d == null) {
+ pm.update(1);
+ continue; // was already matched earlier
+ }
+
+ ChangeType type;
+ if (s.changeType == ChangeType.DELETE) {
+ // First use of this source file. Tag it as a rename so we
+ // later know it has already been used as a rename; other
+ // matches (if any) will claim themselves as copies instead.
+ //
+ s.changeType = ChangeType.RENAME;
+ type = ChangeType.RENAME;
+ } else {
+ type = ChangeType.COPY;
+ }
+
+ out.add(DiffEntry.pair(type, s, d, score(ent)));
+ dsts.set(dIdx, null); // Claim the destination was matched.
+ pm.update(1);
+ }
+
+ srcs = compactSrcList(srcs);
+ dsts = compactDstList(dsts);
+ pm.endTask();
+ }
+
+ List<DiffEntry> getMatches() {
+ return out;
+ }
+
+ List<DiffEntry> getLeftOverSources() {
+ return srcs;
+ }
+
+ List<DiffEntry> getLeftOverDestinations() {
+ return dsts;
+ }
+
+ private static List<DiffEntry> compactSrcList(List<DiffEntry> in) {
+ ArrayList<DiffEntry> r = new ArrayList<DiffEntry>(in.size());
+ for (DiffEntry e : in) {
+ if (e.changeType == ChangeType.DELETE)
+ r.add(e);
+ }
+ return r;
+ }
+
+ private static List<DiffEntry> compactDstList(List<DiffEntry> in) {
+ ArrayList<DiffEntry> r = new ArrayList<DiffEntry>(in.size());
+ for (DiffEntry e : in) {
+ if (e != null)
+ r.add(e);
+ }
+ return r;
+ }
+
+ private int buildMatrix(ProgressMonitor pm) throws IOException {
+ // Allocate for the worst-case scenario where every pair has a
+ // score that we need to consider. We might not need that many.
+ //
+ matrix = new long[srcs.size() * dsts.size()];
+
+ // Consider each pair of files. If the score is at or above the
+ // minimum threshold we need to record that score in the matrix so
+ // we can later find the best matches.
+ //
+ int mNext = 0;
+ for (int srcIdx = 0; srcIdx < srcs.size(); srcIdx++) {
+ DiffEntry srcEnt = srcs.get(srcIdx);
+ if (!isFile(srcEnt.oldMode)) {
+ pm.update(dsts.size());
+ continue;
+ }
+
+ SimilarityIndex s = hash(srcEnt.oldId.toObjectId());
+ for (int dstIdx = 0; dstIdx < dsts.size(); dstIdx++) {
+ DiffEntry dstEnt = dsts.get(dstIdx);
+
+ if (!isFile(dstEnt.newMode)) {
+ pm.update(1);
+ continue;
+ }
+
+ if (!RenameDetector.sameType(srcEnt.oldMode, dstEnt.newMode)) {
+ pm.update(1);
+ continue;
+ }
+
+ SimilarityIndex d = hash(dstEnt.newId.toObjectId());
+ int score = s.score(d);
+
+ if (score < renameScore) {
+ pm.update(1);
+ continue;
+ }
+
+ matrix[mNext++] = encode(score, srcIdx, dstIdx);
+ pm.update(1);
+ }
+ }
+
+ // Sort everything in the range we populated, which might be the
+ // entire matrix, or just a smaller slice if we had some bad low
+ // scoring pairs.
+ //
+ Arrays.sort(matrix, 0, mNext);
+ return mNext;
+ }
+
+ private SimilarityIndex hash(ObjectId objectId) throws IOException {
+ SimilarityIndex r = new SimilarityIndex();
+ r.hash(repo.openObject(objectId));
+ r.sort();
+ return r;
+ }
+
+ private static int score(long value) {
+ return (int) (value >>> SCORE_SHIFT);
+ }
+
+ private static int srcFile(long value) {
+ return decodeFile(((int) (value >>> BITS_PER_INDEX)) & INDEX_MASK);
+ }
+
+ private static int dstFile(long value) {
+ return decodeFile(((int) value) & INDEX_MASK);
+ }
+
+ private static long encode(int score, int srcIdx, int dstIdx) {
+ return (((long) score) << SCORE_SHIFT) //
+ | (encodeFile(srcIdx) << BITS_PER_INDEX) //
+ | encodeFile(dstIdx);
+ }
+
+ private static long encodeFile(int idx) {
+ // We invert the index so that the first file in the list sorts
+ // later in the table. This permits us to break ties favoring
+ // earlier names over later ones.
+ //
+ return INDEX_MASK - idx;
+ }
+
+ private static int decodeFile(int v) {
+ return INDEX_MASK - v;
+ }
+
+ private static boolean isFile(FileMode mode) {
+ return (mode.getBits() & FileMode.TYPE_MASK) == FileMode.TYPE_FILE;
+ }
+}
/** Patch header describing an action for a single file path. */
public class FileHeader extends DiffEntry {
- /** Magical file name used for file adds or deletes. */
- public static final String DEV_NULL = "/dev/null";
-
private static final byte[] OLD_MODE = encodeASCII("old mode ");
private static final byte[] NEW_MODE = encodeASCII("new mode ");