123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430 |
- /*
- * Copyright (C) 2010, Google Inc. and others
- *
- * This program and the accompanying materials are made available under the
- * terms of the Eclipse Distribution License v. 1.0 which is available at
- * https://www.eclipse.org/org/documents/edl-v10.php.
- *
- * SPDX-License-Identifier: BSD-3-Clause
- */
-
- package org.eclipse.jgit.diff;
-
- import static org.eclipse.jgit.diff.DiffEntry.Side.NEW;
- import static org.eclipse.jgit.diff.DiffEntry.Side.OLD;
- import static org.eclipse.jgit.storage.pack.PackConfig.DEFAULT_BIG_FILE_THRESHOLD;
-
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.Arrays;
- import java.util.BitSet;
- import java.util.List;
-
- import org.eclipse.jgit.diff.DiffEntry.ChangeType;
- import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
- import org.eclipse.jgit.errors.CancelledException;
- import org.eclipse.jgit.internal.JGitText;
- import org.eclipse.jgit.lib.FileMode;
- import org.eclipse.jgit.lib.NullProgressMonitor;
- import org.eclipse.jgit.lib.ObjectLoader;
- import org.eclipse.jgit.lib.ProgressMonitor;
-
- class SimilarityRenameDetector {
- /**
- * Number of bits we need to express an index into src or dst list.
- * <p>
- * This must be 28, giving us a limit of 2^28 (268,435,456) entries in either
- * list, or 536,870,912 file names across both lists in a single rename pass,
- * which is an insane limit. The other 8 bits are used to store the score,
- * which must stay at or below 127 so the long doesn't go negative.
- */
- private static final int BITS_PER_INDEX = 28;
-
- private static final int INDEX_MASK = (1 << BITS_PER_INDEX) - 1;
-
- private static final int SCORE_SHIFT = 2 * BITS_PER_INDEX;
-
- private ContentSource.Pair reader;
-
- /**
- * All sources to consider for copies or renames.
- * <p>
- * A source is typically a {@link ChangeType#DELETE} change, but could be
- * another type when trying to perform copy detection concurrently with
- * rename detection.
- */
- private List<DiffEntry> srcs;
-
- /**
- * All destinations to consider looking for a rename.
- * <p>
- * A destination is typically an {@link ChangeType#ADD}, as the name has
- * just come into existence, and we want to discover where its initial
- * content came from.
- */
- private List<DiffEntry> dsts;
-
- /**
- * Matrix of all examined file pairs, and their scores.
- * <p>
- * The upper 8 bits of each long store the score, but the score is bounded
- * to be in the range [0, 128) so that the highest bit is never set, and all
- * entries are therefore non-negative.
- * <p>
- * List indexes to an element of {@link #srcs} and {@link #dsts} are encoded
- * as the lower two groups of 28 bits, respectively, but the encoding is
- * inverted, so that 0 is expressed as {@code (1 << 28) - 1}. This sorts
- * lower list indices later in the matrix, giving precedence to files whose
- * names sort earlier in the tree.
- */
- private long[] matrix;
-
- /** Minimum score a pair must reach (meet or exceed) to be considered a rename. */
- private int renameScore = 60;
-
- /**
- * File size threshold (in bytes) for detecting renames. Files larger
- * than this size will not be processed for renames.
- */
- private int bigFileThreshold = DEFAULT_BIG_FILE_THRESHOLD;
-
- /** Skip content renames for binary files. */
- private boolean skipBinaryFiles = false;
-
- /** Set if any {@link SimilarityIndex.TableFullException} occurs. */
- private boolean tableOverflow;
-
- private List<DiffEntry> out;
-
/**
 * Create a detector over the given candidate lists.
 * <p>
 * The lists are kept by reference, not copied.
 *
 * @param reader
 *            used to size and open the old and new side of an entry.
 * @param srcs
 *            candidate sources of a rename or copy.
 * @param dsts
 *            candidate destinations of a rename or copy.
 */
SimilarityRenameDetector(ContentSource.Pair reader, List<DiffEntry> srcs,
        List<DiffEntry> dsts) {
    this.dsts = dsts;
    this.srcs = srcs;
    this.reader = reader;
}
-
- void setRenameScore(int score) {
- renameScore = score;
- }
-
- void setBigFileThreshold(int threshold) {
- bigFileThreshold = threshold;
- }
-
- void setSkipBinaryFiles(boolean value) {
- skipBinaryFiles = value;
- }
-
- void compute(ProgressMonitor pm) throws IOException, CancelledException {
- if (pm == null)
- pm = NullProgressMonitor.INSTANCE;
-
- pm.beginTask(JGitText.get().renamesFindingByContent, //
- 2 * srcs.size() * dsts.size());
-
- int mNext = buildMatrix(pm);
- out = new ArrayList<>(Math.min(mNext, dsts.size()));
-
- // Match rename pairs on a first come, first serve basis until
- // we have looked at everything that is above our minimum score.
- //
- for (--mNext; mNext >= 0; mNext--) {
- if (pm.isCancelled()) {
- // TODO(ms): use org.eclipse.jgit.api.errors.CanceledException
- // in next major version
- throw new CancelledException(JGitText.get().renameCancelled);
- }
- long ent = matrix[mNext];
- int sIdx = srcFile(ent);
- int dIdx = dstFile(ent);
- DiffEntry s = srcs.get(sIdx);
- DiffEntry d = dsts.get(dIdx);
-
- if (d == null) {
- pm.update(1);
- continue; // was already matched earlier
- }
-
- ChangeType type;
- if (s.changeType == ChangeType.DELETE) {
- // First use of this source file. Tag it as a rename so we
- // later know it is already been used as a rename, other
- // matches (if any) will claim themselves as copies instead.
- //
- s.changeType = ChangeType.RENAME;
- type = ChangeType.RENAME;
- } else {
- type = ChangeType.COPY;
- }
-
- out.add(DiffEntry.pair(type, s, d, score(ent)));
- dsts.set(dIdx, null); // Claim the destination was matched.
- pm.update(1);
- }
-
- srcs = compactSrcList(srcs);
- dsts = compactDstList(dsts);
- pm.endTask();
- }
-
- List<DiffEntry> getMatches() {
- return out;
- }
-
- List<DiffEntry> getLeftOverSources() {
- return srcs;
- }
-
- List<DiffEntry> getLeftOverDestinations() {
- return dsts;
- }
-
- boolean isTableOverflow() {
- return tableOverflow;
- }
-
- private static List<DiffEntry> compactSrcList(List<DiffEntry> in) {
- ArrayList<DiffEntry> r = new ArrayList<>(in.size());
- for (DiffEntry e : in) {
- if (e.changeType == ChangeType.DELETE)
- r.add(e);
- }
- return r;
- }
-
- private static List<DiffEntry> compactDstList(List<DiffEntry> in) {
- ArrayList<DiffEntry> r = new ArrayList<>(in.size());
- for (DiffEntry e : in) {
- if (e != null)
- r.add(e);
- }
- return r;
- }
-
- private int buildMatrix(ProgressMonitor pm)
- throws IOException, CancelledException {
- // Allocate for the worst-case scenario where every pair has a
- // score that we need to consider. We might not need that many.
- //
- matrix = new long[srcs.size() * dsts.size()];
-
- long[] srcSizes = new long[srcs.size()];
- long[] dstSizes = new long[dsts.size()];
- BitSet dstTooLarge = null;
-
- // Consider each pair of files, if the score is above the minimum
- // threshold we need record that scoring in the matrix so we can
- // later find the best matches.
- //
- int mNext = 0;
- SRC: for (int srcIdx = 0; srcIdx < srcs.size(); srcIdx++) {
- DiffEntry srcEnt = srcs.get(srcIdx);
- if (!isFile(srcEnt.oldMode)) {
- pm.update(dsts.size());
- continue;
- }
-
- SimilarityIndex s = null;
-
- for (int dstIdx = 0; dstIdx < dsts.size(); dstIdx++) {
- if (pm.isCancelled()) {
- // TODO(ms): use
- // org.eclipse.jgit.api.errors.CanceledException in next
- // major version
- throw new CancelledException(
- JGitText.get().renameCancelled);
- }
-
- DiffEntry dstEnt = dsts.get(dstIdx);
-
- if (!isFile(dstEnt.newMode)) {
- pm.update(1);
- continue;
- }
-
- if (!RenameDetector.sameType(srcEnt.oldMode, dstEnt.newMode)) {
- pm.update(1);
- continue;
- }
-
- if (dstTooLarge != null && dstTooLarge.get(dstIdx)) {
- pm.update(1);
- continue;
- }
-
- long srcSize = srcSizes[srcIdx];
- if (srcSize == 0) {
- srcSize = size(OLD, srcEnt) + 1;
- srcSizes[srcIdx] = srcSize;
- }
-
- long dstSize = dstSizes[dstIdx];
- if (dstSize == 0) {
- dstSize = size(NEW, dstEnt) + 1;
- dstSizes[dstIdx] = dstSize;
- }
-
- long max = Math.max(srcSize, dstSize);
- long min = Math.min(srcSize, dstSize);
- if (min * 100 / max < renameScore) {
- // Cannot possibly match, as the file sizes are so different
- pm.update(1);
- continue;
- }
-
- if (max > bigFileThreshold) {
- pm.update(1);
- continue;
- }
-
- if (s == null) {
- try {
- ObjectLoader loader = reader.open(OLD, srcEnt);
- if (skipBinaryFiles && SimilarityIndex.isBinary(loader)) {
- pm.update(1);
- continue SRC;
- }
- s = hash(loader);
- } catch (TableFullException tableFull) {
- tableOverflow = true;
- continue SRC;
- }
- }
-
- SimilarityIndex d;
- try {
- ObjectLoader loader = reader.open(NEW, dstEnt);
- if (skipBinaryFiles && SimilarityIndex.isBinary(loader)) {
- pm.update(1);
- continue;
- }
- d = hash(loader);
- } catch (TableFullException tableFull) {
- if (dstTooLarge == null)
- dstTooLarge = new BitSet(dsts.size());
- dstTooLarge.set(dstIdx);
- tableOverflow = true;
- pm.update(1);
- continue;
- }
-
- int contentScore = s.score(d, 10000);
-
- // nameScore returns a value between 0 and 100, but we want it
- // to be in the same range as the content score. This allows it
- // to be dropped into the pretty formula for the final score.
- int nameScore = nameScore(srcEnt.oldPath, dstEnt.newPath) * 100;
-
- int score = (contentScore * 99 + nameScore * 1) / 10000;
-
- if (score < renameScore) {
- pm.update(1);
- continue;
- }
-
- matrix[mNext++] = encode(score, srcIdx, dstIdx);
- pm.update(1);
- }
- }
-
- // Sort everything in the range we populated, which might be the
- // entire matrix, or just a smaller slice if we had some bad low
- // scoring pairs.
- //
- Arrays.sort(matrix, 0, mNext);
- return mNext;
- }
-
- static int nameScore(String a, String b) {
- int aDirLen = a.lastIndexOf('/') + 1;
- int bDirLen = b.lastIndexOf('/') + 1;
-
- int dirMin = Math.min(aDirLen, bDirLen);
- int dirMax = Math.max(aDirLen, bDirLen);
-
- final int dirScoreLtr;
- final int dirScoreRtl;
-
- if (dirMax == 0) {
- dirScoreLtr = 100;
- dirScoreRtl = 100;
- } else {
- int dirSim = 0;
- for (; dirSim < dirMin; dirSim++) {
- if (a.charAt(dirSim) != b.charAt(dirSim))
- break;
- }
- dirScoreLtr = (dirSim * 100) / dirMax;
-
- if (dirScoreLtr == 100) {
- dirScoreRtl = 100;
- } else {
- for (dirSim = 0; dirSim < dirMin; dirSim++) {
- if (a.charAt(aDirLen - 1 - dirSim) != b.charAt(bDirLen - 1
- - dirSim))
- break;
- }
- dirScoreRtl = (dirSim * 100) / dirMax;
- }
- }
-
- int fileMin = Math.min(a.length() - aDirLen, b.length() - bDirLen);
- int fileMax = Math.max(a.length() - aDirLen, b.length() - bDirLen);
-
- int fileSim = 0;
- for (; fileSim < fileMin; fileSim++) {
- if (a.charAt(a.length() - 1 - fileSim) != b.charAt(b.length() - 1
- - fileSim))
- break;
- }
- int fileScore = (fileSim * 100) / fileMax;
-
- return (((dirScoreLtr + dirScoreRtl) * 25) + (fileScore * 50)) / 100;
- }
-
- private SimilarityIndex hash(ObjectLoader objectLoader)
- throws IOException, TableFullException {
- SimilarityIndex r = new SimilarityIndex();
- r.hash(objectLoader);
- r.sort();
- return r;
- }
-
- private long size(DiffEntry.Side side, DiffEntry ent) throws IOException {
- return reader.size(side, ent);
- }
-
- private static int score(long value) {
- return (int) (value >>> SCORE_SHIFT);
- }
-
- static int srcFile(long value) {
- return decodeFile(((int) (value >>> BITS_PER_INDEX)) & INDEX_MASK);
- }
-
- static int dstFile(long value) {
- return decodeFile(((int) value) & INDEX_MASK);
- }
-
- static long encode(int score, int srcIdx, int dstIdx) {
- return (((long) score) << SCORE_SHIFT) //
- | (encodeFile(srcIdx) << BITS_PER_INDEX) //
- | encodeFile(dstIdx);
- }
-
- private static long encodeFile(int idx) {
- // We invert the index so that the first file in the list sorts
- // later in the table. This permits us to break ties favoring
- // earlier names over later ones.
- //
- return INDEX_MASK - idx;
- }
-
- private static int decodeFile(int v) {
- return INDEX_MASK - v;
- }
-
- private static boolean isFile(FileMode mode) {
- return (mode.getBits() & FileMode.TYPE_MASK) == FileMode.TYPE_FILE;
- }
- }
|