From 4a78d911c578a6f9028d6e74b5668dfc384ef80f Mon Sep 17 00:00:00 2001 From: Youssef Elghareeb Date: Thu, 11 Mar 2021 13:01:14 +0100 Subject: [PATCH] Skip detecting content renames for large files There are two code paths for detecting renames: one on tree diffs (using DiffFormatter#scan) and the other on single file diffs (using DiffFormatter#format). The latter skips binary and large files for rename detection - check [1], but the former doesn't. This change skips content rename detection for the tree diffs case for large files. This is essential to avoid expensive computations while reading the file, especially for callers who don't want to pay that cost. Content renames are those which involve files with slightly modified content. Exact renames will still be identified. The default threshold for file sizes is reused from PackConfig.DEFAULT_BIG_FILE_THRESHOLD: 50 MB. [1] https://git.eclipse.org/r/plugins/gitiles/jgit/jgit/+/232876421d067a1242e8afcaa33b9171342fee3e/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java#386 Change-Id: Idbc2c29bd381c6e387185204638f76fda47df41e Signed-off-by: Youssef Elghareeb --- .../eclipse/jgit/diff/RenameDetectorTest.java | 48 +++++++++++++++++++ .../org/eclipse/jgit/diff/RenameDetector.java | 28 +++++++++++ .../jgit/diff/SimilarityRenameDetector.java | 16 +++++++ 3 files changed, 92 insertions(+) diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java index 6203feda48..2ea3cd7ebe 100644 --- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java +++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java @@ -542,6 +542,43 @@ public class RenameDetectorTest extends RepositoryTestCase { assertEquals(0, modify.score); } + @Test + public void testExactRename_LargeFile() throws Exception { + ObjectId aId = blob("blah\nblah\nfoo"); // size = 14 + + DiffEntry a = DiffEntry.add(PATH_A, aId); + DiffEntry b = DiffEntry.delete(PATH_Q, aId); + + rd.add(a); + rd.add(b); + + // Exact renames are identified for large files + rd.setBigFileThreshold(10); + List entries = rd.compute(); + assertEquals(1, entries.size()); + assertRename(b, a, 100, entries.get(0)); + } + + @Test + public void testInexactRename_LargeFile() throws Exception { + ObjectId aId = blob("blah\nblah\nfoo"); // size = 14 + ObjectId bId = blob("bla\nblah\nfoo"); // size = 13 + + DiffEntry a = DiffEntry.add(PATH_A, aId); + DiffEntry b = DiffEntry.delete(PATH_Q, bId); + + rd.add(a); + rd.add(b); + + rd.setBigFileThreshold(10); + + // Inexact renames are not detected for large files + List entries = rd.compute(); + assertEquals(2, entries.size()); + assertAdd(PATH_A, aId, FileMode.REGULAR_FILE, entries.get(0)); + assertDelete(PATH_Q, bId, FileMode.REGULAR_FILE, entries.get(1)); + } + @Test public void testSetRenameScore_IllegalArgs() throws Exception { try { @@ -634,4 +671,15 @@ public class RenameDetectorTest extends RepositoryTestCase { assertEquals(AbbreviatedObjectId.fromObjectId(newId), add.newId); assertEquals(newMode, add.newMode); } + + private static void assertDelete(String oldName, ObjectId oldId, + FileMode oldMode, DiffEntry delete) { + assertEquals(DiffEntry.DEV_NULL, delete.newPath); + assertEquals(DiffEntry.A_ZERO, delete.newId); + assertEquals(FileMode.MISSING, delete.newMode); + assertEquals(ChangeType.DELETE, delete.changeType); + assertEquals(oldName, delete.oldPath); + assertEquals(AbbreviatedObjectId.fromObjectId(oldId), delete.oldId); + assertEquals(oldMode, delete.oldMode); + } } diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java index 80e1b18291..75784c2556 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java @@ -12,6 +12,7 @@ package org.eclipse.jgit.diff; import static org.eclipse.jgit.diff.DiffEntry.Side.NEW; import static org.eclipse.jgit.diff.DiffEntry.Side.OLD; +import static org.eclipse.jgit.storage.pack.PackConfig.DEFAULT_BIG_FILE_THRESHOLD; import java.io.IOException; import java.util.ArrayList; @@ -97,6 +98,12 @@ public class RenameDetector { /** Limit in the number of files to consider for renames. */ private int renameLimit; + /** + * File size threshold (in bytes) for detecting renames. Files larger + * than this size will not be processed for renames. + */ + private int bigFileThreshold = DEFAULT_BIG_FILE_THRESHOLD; + /** Set if the number of adds or deletes was over the limit. */ private boolean overRenameLimit; @@ -208,6 +215,26 @@ public class RenameDetector { renameLimit = limit; } + /** + * Get file size threshold for detecting renames. Files larger + * than this size will not be processed for rename detection. + * + * @return threshold in bytes of the file size. + * @since 5.12 + */ + public int getBigFileThreshold() { return bigFileThreshold; } + + /** + * Set the file size threshold for detecting renames. Files larger than this + * threshold will be skipped during rename detection computation. + * + * @param threshold file size threshold in bytes. + * @since 5.12 + */ + public void setBigFileThreshold(int threshold) { + this.bigFileThreshold = threshold; + } + /** * Check if the detector is over the rename limit. *

@@ -493,6 +520,7 @@ public class RenameDetector { d = new SimilarityRenameDetector(reader, deleted, added); d.setRenameScore(getRenameScore()); + d.setBigFileThreshold(getBigFileThreshold()); d.compute(pm); overRenameLimit |= d.isTableOverflow(); deleted = d.getLeftOverSources(); diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java index 74a11a024a..082f31d178 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java @@ -12,6 +12,7 @@ package org.eclipse.jgit.diff; import static org.eclipse.jgit.diff.DiffEntry.Side.NEW; import static org.eclipse.jgit.diff.DiffEntry.Side.OLD; +import static org.eclipse.jgit.storage.pack.PackConfig.DEFAULT_BIG_FILE_THRESHOLD; import java.io.IOException; import java.util.ArrayList; @@ -80,6 +81,12 @@ class SimilarityRenameDetector { /** Score a pair must exceed to be considered a rename. */ private int renameScore = 60; + /** + * File size threshold (in bytes) for detecting renames. Files larger + * than this size will not be processed for renames. + */ + private int bigFileThreshold = DEFAULT_BIG_FILE_THRESHOLD; + /** Set if any {@link SimilarityIndex.TableFullException} occurs. */ private boolean tableOverflow; @@ -96,6 +103,10 @@ class SimilarityRenameDetector { renameScore = score; } + void setBigFileThreshold(int threshold) { + bigFileThreshold = threshold; + } + void compute(ProgressMonitor pm) throws IOException, CancelledException { if (pm == null) pm = NullProgressMonitor.INSTANCE; @@ -253,6 +264,11 @@ class SimilarityRenameDetector { continue; } + if (max > bigFileThreshold) { + pm.update(1); + continue; + } + if (s == null) { try { s = hash(OLD, srcEnt); -- 2.39.5