diff options
author | Youssef Elghareeb <ghareeb@google.com> | 2021-03-11 13:01:14 +0100 |
---|---|---|
committer | Matthias Sohn <matthias.sohn@sap.com> | 2021-03-14 11:38:13 +0100 |
commit | 4a78d911c578a6f9028d6e74b5668dfc384ef80f (patch) | |
tree | 6dc5c597adab84b5043bad901fc9aa1456bbb314 | |
parent | 84ed57d2f63307437492f3a2c9b60ce7737b96e6 (diff) | |
download | jgit-4a78d911c578a6f9028d6e74b5668dfc384ef80f.tar.gz jgit-4a78d911c578a6f9028d6e74b5668dfc384ef80f.zip |
Skip detecting content renames for large files
There are two code paths for detecting renames: one on tree diffs
(using DiffFormatter#scan) and the other on single file diffs (using
DiffFormatter#format). The latter skips binary and large files
for rename detection - check [1], but the former doesn't.
This change skips content rename detection for the tree diffs case for
large files. This is essential to avoid expensive computations while
reading the file, especially for callers who don't want to pay that
cost. Content renames are those which involve files with slightly
modified content. Exact renames will still be identified.
The default threshold for file sizes is reused from
PackConfig.DEFAULT_BIG_FILE_THRESHOLD: 50 MB.
[1] https://git.eclipse.org/r/plugins/gitiles/jgit/jgit/+/232876421d067a1242e8afcaa33b9171342fee3e/org.eclipse.jgit/src/org/eclipse/jgit/diff/RawText.java#386
Change-Id: Idbc2c29bd381c6e387185204638f76fda47df41e
Signed-off-by: Youssef Elghareeb <ghareeb@google.com>
3 files changed, 92 insertions, 0 deletions
diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java index 6203feda48..2ea3cd7ebe 100644 --- a/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java +++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/diff/RenameDetectorTest.java @@ -543,6 +543,43 @@ public class RenameDetectorTest extends RepositoryTestCase { } @Test + public void testExactRename_LargeFile() throws Exception { + ObjectId aId = blob("blah\nblah\nfoo"); // size = 14 + + DiffEntry a = DiffEntry.add(PATH_A, aId); + DiffEntry b = DiffEntry.delete(PATH_Q, aId); + + rd.add(a); + rd.add(b); + + // Exact renames are identified for large files + rd.setBigFileThreshold(10); + List<DiffEntry> entries = rd.compute(); + assertEquals(1, entries.size()); + assertRename(b, a, 100, entries.get(0)); + } + + @Test + public void testInexactRename_LargeFile() throws Exception { + ObjectId aId = blob("blah\nblah\nfoo"); // size = 14 + ObjectId bId = blob("bla\nblah\nfoo"); // size = 13 + + DiffEntry a = DiffEntry.add(PATH_A, aId); + DiffEntry b = DiffEntry.delete(PATH_Q, bId); + + rd.add(a); + rd.add(b); + + rd.setBigFileThreshold(10); + + // Inexact renames are not detected for large files + List<DiffEntry> entries = rd.compute(); + assertEquals(2, entries.size()); + assertAdd(PATH_A, aId, FileMode.REGULAR_FILE, entries.get(0)); + assertDelete(PATH_Q, bId, FileMode.REGULAR_FILE, entries.get(1)); + } + + @Test public void testSetRenameScore_IllegalArgs() throws Exception { try { rd.setRenameScore(-1); @@ -634,4 +671,15 @@ public class RenameDetectorTest extends RepositoryTestCase { assertEquals(AbbreviatedObjectId.fromObjectId(newId), add.newId); assertEquals(newMode, add.newMode); } + + private static void assertDelete(String oldName, ObjectId oldId, + FileMode oldMode, DiffEntry delete) { + assertEquals(DiffEntry.DEV_NULL, delete.newPath); + assertEquals(DiffEntry.A_ZERO, delete.newId); + assertEquals(FileMode.MISSING, delete.newMode); + assertEquals(ChangeType.DELETE, delete.changeType); + assertEquals(oldName, delete.oldPath); + assertEquals(AbbreviatedObjectId.fromObjectId(oldId), delete.oldId); + assertEquals(oldMode, delete.oldMode); + } } diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java index 80e1b18291..75784c2556 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java @@ -12,6 +12,7 @@ package org.eclipse.jgit.diff; import static org.eclipse.jgit.diff.DiffEntry.Side.NEW; import static org.eclipse.jgit.diff.DiffEntry.Side.OLD; +import static org.eclipse.jgit.storage.pack.PackConfig.DEFAULT_BIG_FILE_THRESHOLD; import java.io.IOException; import java.util.ArrayList; @@ -97,6 +98,12 @@ public class RenameDetector { /** Limit in the number of files to consider for renames. */ private int renameLimit; + /** + * File size threshold (in bytes) for detecting renames. Files larger + * than this size will not be processed for renames. + */ + private int bigFileThreshold = DEFAULT_BIG_FILE_THRESHOLD; + /** Set if the number of adds or deletes was over the limit. */ private boolean overRenameLimit; @@ -209,6 +216,26 @@ public class RenameDetector { } /** + * Get file size threshold for detecting renames. Files larger + * than this size will not be processed for rename detection. + * + * @return threshold in bytes of the file size. + * @since 5.12 + */ + public int getBigFileThreshold() { return bigFileThreshold; } + + /** + * Set the file size threshold for detecting renames. Files larger than this + * threshold will be skipped during rename detection computation. + * + * @param threshold file size threshold in bytes. + * @since 5.12 + */ + public void setBigFileThreshold(int threshold) { + this.bigFileThreshold = threshold; + } + + /** * Check if the detector is over the rename limit. * <p> * This method can be invoked either before or after {@code getEntries} has @@ -493,6 +520,7 @@ public class RenameDetector { d = new SimilarityRenameDetector(reader, deleted, added); d.setRenameScore(getRenameScore()); + d.setBigFileThreshold(getBigFileThreshold()); d.compute(pm); overRenameLimit |= d.isTableOverflow(); deleted = d.getLeftOverSources(); diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java index 74a11a024a..082f31d178 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java @@ -12,6 +12,7 @@ package org.eclipse.jgit.diff; import static org.eclipse.jgit.diff.DiffEntry.Side.NEW; import static org.eclipse.jgit.diff.DiffEntry.Side.OLD; +import static org.eclipse.jgit.storage.pack.PackConfig.DEFAULT_BIG_FILE_THRESHOLD; import java.io.IOException; import java.util.ArrayList; @@ -80,6 +81,12 @@ class SimilarityRenameDetector { /** Score a pair must exceed to be considered a rename. */ private int renameScore = 60; + /** + * File size threshold (in bytes) for detecting renames. Files larger + * than this size will not be processed for renames. + */ + private int bigFileThreshold = DEFAULT_BIG_FILE_THRESHOLD; + /** Set if any {@link SimilarityIndex.TableFullException} occurs. */ private boolean tableOverflow; @@ -96,6 +103,10 @@ class SimilarityRenameDetector { renameScore = score; } + void setBigFileThreshold(int threshold) { + bigFileThreshold = threshold; + } + void compute(ProgressMonitor pm) throws IOException, CancelledException { if (pm == null) pm = NullProgressMonitor.INSTANCE; @@ -253,6 +264,11 @@ class SimilarityRenameDetector { continue; } + if (max > bigFileThreshold) { + pm.update(1); + continue; + } + if (s == null) { try { s = hash(OLD, srcEnt); |