Browse Source

Skip detecting content renames for large files

There are two code paths for detecting renames: one on tree diffs
(using DiffFormatter#scan) and the other on single file diffs (using
DiffFormatter#format). The latter skips binary and large files
for rename detection - check [1], but the former doesn't.

This change skips content rename detection for the tree diffs case for
large files. This is essential to avoid expensive computations while
reading the file, especially for callers who don't want to pay that
cost. Content renames are those which involve files with slightly
modified content. Exact renames will still be identified.

The default threshold for file sizes is reused from
PackConfig.DEFAULT_BIG_FILE_THRESHOLD: 50 MB.

[1] 232876421d/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java (386)

Change-Id: Idbc2c29bd381c6e387185204638f76fda47df41e
Signed-off-by: Youssef Elghareeb <ghareeb@google.com>
tags/v5.12.0.202105051250-m2
Youssef Elghareeb 3 years ago
parent
commit
4a78d911c5

+ 48
- 0
org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java View File

@@ -542,6 +542,43 @@ public class RenameDetectorTest extends RepositoryTestCase {
assertEquals(0, modify.score);
}

@Test
public void testExactRename_LargeFile() throws Exception {
ObjectId aId = blob("blah\nblah\nfoo"); // size = 14

DiffEntry a = DiffEntry.add(PATH_A, aId);
DiffEntry b = DiffEntry.delete(PATH_Q, aId);

rd.add(a);
rd.add(b);

// Exact renames are identified for large files
rd.setBigFileThreshold(10);
List<DiffEntry> entries = rd.compute();
assertEquals(1, entries.size());
assertRename(b, a, 100, entries.get(0));
}

@Test
public void testInexactRename_LargeFile() throws Exception {
ObjectId aId = blob("blah\nblah\nfoo"); // size = 14
ObjectId bId = blob("bla\nblah\nfoo"); // size = 13

DiffEntry a = DiffEntry.add(PATH_A, aId);
DiffEntry b = DiffEntry.delete(PATH_Q, bId);

rd.add(a);
rd.add(b);

rd.setBigFileThreshold(10);

// Inexact renames are not detected for large files
List<DiffEntry> entries = rd.compute();
assertEquals(2, entries.size());
assertAdd(PATH_A, aId, FileMode.REGULAR_FILE, entries.get(0));
assertDelete(PATH_Q, bId, FileMode.REGULAR_FILE, entries.get(1));
}

@Test
public void testSetRenameScore_IllegalArgs() throws Exception {
try {
@@ -634,4 +671,15 @@ public class RenameDetectorTest extends RepositoryTestCase {
assertEquals(AbbreviatedObjectId.fromObjectId(newId), add.newId);
assertEquals(newMode, add.newMode);
}

private static void assertDelete(String oldName, ObjectId oldId,
FileMode oldMode, DiffEntry delete) {
assertEquals(DiffEntry.DEV_NULL, delete.newPath);
assertEquals(DiffEntry.A_ZERO, delete.newId);
assertEquals(FileMode.MISSING, delete.newMode);
assertEquals(ChangeType.DELETE, delete.changeType);
assertEquals(oldName, delete.oldPath);
assertEquals(AbbreviatedObjectId.fromObjectId(oldId), delete.oldId);
assertEquals(oldMode, delete.oldMode);
}
}

+ 28
- 0
org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java View File

@@ -12,6 +12,7 @@ package org.eclipse.jgit.diff;

import static org.eclipse.jgit.diff.DiffEntry.Side.NEW;
import static org.eclipse.jgit.diff.DiffEntry.Side.OLD;
import static org.eclipse.jgit.storage.pack.PackConfig.DEFAULT_BIG_FILE_THRESHOLD;

import java.io.IOException;
import java.util.ArrayList;
@@ -97,6 +98,12 @@ public class RenameDetector {
/** Limit in the number of files to consider for renames. */
private int renameLimit;

/**
* File size threshold (in bytes) for detecting renames. Files larger
* than this size will not be processed for renames.
*/
private int bigFileThreshold = DEFAULT_BIG_FILE_THRESHOLD;

/** Set if the number of adds or deletes was over the limit. */
private boolean overRenameLimit;

@@ -208,6 +215,26 @@ public class RenameDetector {
renameLimit = limit;
}

/**
* Get file size threshold for detecting renames. Files larger
* than this size will not be processed for rename detection.
*
* @return threshold in bytes of the file size.
* @since 5.12
*/
public int getBigFileThreshold() { return bigFileThreshold; }

/**
* Set the file size threshold for detecting renames. Files larger than this
* threshold will be skipped during rename detection computation.
*
* @param threshold file size threshold in bytes.
* @since 5.12
*/
public void setBigFileThreshold(int threshold) {
this.bigFileThreshold = threshold;
}

/**
* Check if the detector is over the rename limit.
* <p>
@@ -493,6 +520,7 @@ public class RenameDetector {

d = new SimilarityRenameDetector(reader, deleted, added);
d.setRenameScore(getRenameScore());
d.setBigFileThreshold(getBigFileThreshold());
d.compute(pm);
overRenameLimit |= d.isTableOverflow();
deleted = d.getLeftOverSources();

+ 16
- 0
org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java View File

@@ -12,6 +12,7 @@ package org.eclipse.jgit.diff;

import static org.eclipse.jgit.diff.DiffEntry.Side.NEW;
import static org.eclipse.jgit.diff.DiffEntry.Side.OLD;
import static org.eclipse.jgit.storage.pack.PackConfig.DEFAULT_BIG_FILE_THRESHOLD;

import java.io.IOException;
import java.util.ArrayList;
@@ -80,6 +81,12 @@ class SimilarityRenameDetector {
/** Score a pair must exceed to be considered a rename. */
private int renameScore = 60;

/**
* File size threshold (in bytes) for detecting renames. Files larger
* than this size will not be processed for renames.
*/
private int bigFileThreshold = DEFAULT_BIG_FILE_THRESHOLD;

/** Set if any {@link SimilarityIndex.TableFullException} occurs. */
private boolean tableOverflow;

@@ -96,6 +103,10 @@ class SimilarityRenameDetector {
renameScore = score;
}

void setBigFileThreshold(int threshold) {
bigFileThreshold = threshold;
}

void compute(ProgressMonitor pm) throws IOException, CancelledException {
if (pm == null)
pm = NullProgressMonitor.INSTANCE;
@@ -253,6 +264,11 @@ class SimilarityRenameDetector {
continue;
}

if (max > bigFileThreshold) {
pm.update(1);
continue;
}

if (s == null) {
try {
s = hash(OLD, srcEnt);

Loading…
Cancel
Save