You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

SimilarityRenameDetector.java 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412
  1. /*
  2. * Copyright (C) 2010, Google Inc. and others
  3. *
  4. * This program and the accompanying materials are made available under the
  5. * terms of the Eclipse Distribution License v. 1.0 which is available at
  6. * https://www.eclipse.org/org/documents/edl-v10.php.
  7. *
  8. * SPDX-License-Identifier: BSD-3-Clause
  9. */
  10. package org.eclipse.jgit.diff;
  11. import static org.eclipse.jgit.diff.DiffEntry.Side.NEW;
  12. import static org.eclipse.jgit.diff.DiffEntry.Side.OLD;
  13. import static org.eclipse.jgit.storage.pack.PackConfig.DEFAULT_BIG_FILE_THRESHOLD;
  14. import java.io.IOException;
  15. import java.util.ArrayList;
  16. import java.util.Arrays;
  17. import java.util.BitSet;
  18. import java.util.List;
  19. import org.eclipse.jgit.diff.DiffEntry.ChangeType;
  20. import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
  21. import org.eclipse.jgit.errors.CancelledException;
  22. import org.eclipse.jgit.internal.JGitText;
  23. import org.eclipse.jgit.lib.FileMode;
  24. import org.eclipse.jgit.lib.NullProgressMonitor;
  25. import org.eclipse.jgit.lib.ProgressMonitor;
  26. class SimilarityRenameDetector {
  27. /**
  28. * Number of bits we need to express an index into src or dst list.
  29. * <p>
  30. * This must be 28, giving us a limit of 2^28 entries in either list, which
  31. * is an insane limit of 536,870,912 file names being considered in a single
  32. * rename pass. The other 8 bits are used to store the score, while staying
  33. * under 127 so the long doesn't go negative.
  34. */
  35. private static final int BITS_PER_INDEX = 28;
  36. private static final int INDEX_MASK = (1 << BITS_PER_INDEX) - 1;
  37. private static final int SCORE_SHIFT = 2 * BITS_PER_INDEX;
  38. private ContentSource.Pair reader;
  39. /**
  40. * All sources to consider for copies or renames.
  41. * <p>
  42. * A source is typically a {@link ChangeType#DELETE} change, but could be
  43. * another type when trying to perform copy detection concurrently with
  44. * rename detection.
  45. */
  46. private List<DiffEntry> srcs;
  47. /**
  48. * All destinations to consider looking for a rename.
  49. * <p>
  50. * A destination is typically an {@link ChangeType#ADD}, as the name has
  51. * just come into existence, and we want to discover where its initial
  52. * content came from.
  53. */
  54. private List<DiffEntry> dsts;
  55. /**
  56. * Matrix of all examined file pairs, and their scores.
  57. * <p>
  58. * The upper 8 bits of each long stores the score, but the score is bounded
  59. * to be in the range (0, 128] so that the highest bit is never set, and all
  60. * entries are therefore positive.
  61. * <p>
  62. * List indexes to an element of {@link #srcs} and {@link #dsts} are encoded
  63. * as the lower two groups of 28 bits, respectively, but the encoding is
  64. * inverted, so that 0 is expressed as {@code (1 << 28) - 1}. This sorts
  65. * lower list indices later in the matrix, giving precedence to files whose
  66. * names sort earlier in the tree.
  67. */
  68. private long[] matrix;
  69. /** Score a pair must exceed to be considered a rename. */
  70. private int renameScore = 60;
  71. /**
  72. * File size threshold (in bytes) for detecting renames. Files larger
  73. * than this size will not be processed for renames.
  74. */
  75. private int bigFileThreshold = DEFAULT_BIG_FILE_THRESHOLD;
  76. /** Set if any {@link SimilarityIndex.TableFullException} occurs. */
  77. private boolean tableOverflow;
  78. private List<DiffEntry> out;
  79. SimilarityRenameDetector(ContentSource.Pair reader, List<DiffEntry> srcs,
  80. List<DiffEntry> dsts) {
  81. this.reader = reader;
  82. this.srcs = srcs;
  83. this.dsts = dsts;
  84. }
  85. void setRenameScore(int score) {
  86. renameScore = score;
  87. }
  88. void setBigFileThreshold(int threshold) {
  89. bigFileThreshold = threshold;
  90. }
  91. void compute(ProgressMonitor pm) throws IOException, CancelledException {
  92. if (pm == null)
  93. pm = NullProgressMonitor.INSTANCE;
  94. pm.beginTask(JGitText.get().renamesFindingByContent, //
  95. 2 * srcs.size() * dsts.size());
  96. int mNext = buildMatrix(pm);
  97. out = new ArrayList<>(Math.min(mNext, dsts.size()));
  98. // Match rename pairs on a first come, first serve basis until
  99. // we have looked at everything that is above our minimum score.
  100. //
  101. for (--mNext; mNext >= 0; mNext--) {
  102. if (pm.isCancelled()) {
  103. // TODO(ms): use org.eclipse.jgit.api.errors.CanceledException
  104. // in next major version
  105. throw new CancelledException(JGitText.get().renameCancelled);
  106. }
  107. long ent = matrix[mNext];
  108. int sIdx = srcFile(ent);
  109. int dIdx = dstFile(ent);
  110. DiffEntry s = srcs.get(sIdx);
  111. DiffEntry d = dsts.get(dIdx);
  112. if (d == null) {
  113. pm.update(1);
  114. continue; // was already matched earlier
  115. }
  116. ChangeType type;
  117. if (s.changeType == ChangeType.DELETE) {
  118. // First use of this source file. Tag it as a rename so we
  119. // later know it is already been used as a rename, other
  120. // matches (if any) will claim themselves as copies instead.
  121. //
  122. s.changeType = ChangeType.RENAME;
  123. type = ChangeType.RENAME;
  124. } else {
  125. type = ChangeType.COPY;
  126. }
  127. out.add(DiffEntry.pair(type, s, d, score(ent)));
  128. dsts.set(dIdx, null); // Claim the destination was matched.
  129. pm.update(1);
  130. }
  131. srcs = compactSrcList(srcs);
  132. dsts = compactDstList(dsts);
  133. pm.endTask();
  134. }
  135. List<DiffEntry> getMatches() {
  136. return out;
  137. }
  138. List<DiffEntry> getLeftOverSources() {
  139. return srcs;
  140. }
  141. List<DiffEntry> getLeftOverDestinations() {
  142. return dsts;
  143. }
  144. boolean isTableOverflow() {
  145. return tableOverflow;
  146. }
  147. private static List<DiffEntry> compactSrcList(List<DiffEntry> in) {
  148. ArrayList<DiffEntry> r = new ArrayList<>(in.size());
  149. for (DiffEntry e : in) {
  150. if (e.changeType == ChangeType.DELETE)
  151. r.add(e);
  152. }
  153. return r;
  154. }
  155. private static List<DiffEntry> compactDstList(List<DiffEntry> in) {
  156. ArrayList<DiffEntry> r = new ArrayList<>(in.size());
  157. for (DiffEntry e : in) {
  158. if (e != null)
  159. r.add(e);
  160. }
  161. return r;
  162. }
  163. private int buildMatrix(ProgressMonitor pm)
  164. throws IOException, CancelledException {
  165. // Allocate for the worst-case scenario where every pair has a
  166. // score that we need to consider. We might not need that many.
  167. //
  168. matrix = new long[srcs.size() * dsts.size()];
  169. long[] srcSizes = new long[srcs.size()];
  170. long[] dstSizes = new long[dsts.size()];
  171. BitSet dstTooLarge = null;
  172. // Consider each pair of files, if the score is above the minimum
  173. // threshold we need record that scoring in the matrix so we can
  174. // later find the best matches.
  175. //
  176. int mNext = 0;
  177. SRC: for (int srcIdx = 0; srcIdx < srcs.size(); srcIdx++) {
  178. DiffEntry srcEnt = srcs.get(srcIdx);
  179. if (!isFile(srcEnt.oldMode)) {
  180. pm.update(dsts.size());
  181. continue;
  182. }
  183. SimilarityIndex s = null;
  184. for (int dstIdx = 0; dstIdx < dsts.size(); dstIdx++) {
  185. if (pm.isCancelled()) {
  186. // TODO(ms): use
  187. // org.eclipse.jgit.api.errors.CanceledException in next
  188. // major version
  189. throw new CancelledException(
  190. JGitText.get().renameCancelled);
  191. }
  192. DiffEntry dstEnt = dsts.get(dstIdx);
  193. if (!isFile(dstEnt.newMode)) {
  194. pm.update(1);
  195. continue;
  196. }
  197. if (!RenameDetector.sameType(srcEnt.oldMode, dstEnt.newMode)) {
  198. pm.update(1);
  199. continue;
  200. }
  201. if (dstTooLarge != null && dstTooLarge.get(dstIdx)) {
  202. pm.update(1);
  203. continue;
  204. }
  205. long srcSize = srcSizes[srcIdx];
  206. if (srcSize == 0) {
  207. srcSize = size(OLD, srcEnt) + 1;
  208. srcSizes[srcIdx] = srcSize;
  209. }
  210. long dstSize = dstSizes[dstIdx];
  211. if (dstSize == 0) {
  212. dstSize = size(NEW, dstEnt) + 1;
  213. dstSizes[dstIdx] = dstSize;
  214. }
  215. long max = Math.max(srcSize, dstSize);
  216. long min = Math.min(srcSize, dstSize);
  217. if (min * 100 / max < renameScore) {
  218. // Cannot possibly match, as the file sizes are so different
  219. pm.update(1);
  220. continue;
  221. }
  222. if (max > bigFileThreshold) {
  223. pm.update(1);
  224. continue;
  225. }
  226. if (s == null) {
  227. try {
  228. s = hash(OLD, srcEnt);
  229. } catch (TableFullException tableFull) {
  230. tableOverflow = true;
  231. continue SRC;
  232. }
  233. }
  234. SimilarityIndex d;
  235. try {
  236. d = hash(NEW, dstEnt);
  237. } catch (TableFullException tableFull) {
  238. if (dstTooLarge == null)
  239. dstTooLarge = new BitSet(dsts.size());
  240. dstTooLarge.set(dstIdx);
  241. tableOverflow = true;
  242. pm.update(1);
  243. continue;
  244. }
  245. int contentScore = s.score(d, 10000);
  246. // nameScore returns a value between 0 and 100, but we want it
  247. // to be in the same range as the content score. This allows it
  248. // to be dropped into the pretty formula for the final score.
  249. int nameScore = nameScore(srcEnt.oldPath, dstEnt.newPath) * 100;
  250. int score = (contentScore * 99 + nameScore * 1) / 10000;
  251. if (score < renameScore) {
  252. pm.update(1);
  253. continue;
  254. }
  255. matrix[mNext++] = encode(score, srcIdx, dstIdx);
  256. pm.update(1);
  257. }
  258. }
  259. // Sort everything in the range we populated, which might be the
  260. // entire matrix, or just a smaller slice if we had some bad low
  261. // scoring pairs.
  262. //
  263. Arrays.sort(matrix, 0, mNext);
  264. return mNext;
  265. }
  266. static int nameScore(String a, String b) {
  267. int aDirLen = a.lastIndexOf('/') + 1;
  268. int bDirLen = b.lastIndexOf('/') + 1;
  269. int dirMin = Math.min(aDirLen, bDirLen);
  270. int dirMax = Math.max(aDirLen, bDirLen);
  271. final int dirScoreLtr;
  272. final int dirScoreRtl;
  273. if (dirMax == 0) {
  274. dirScoreLtr = 100;
  275. dirScoreRtl = 100;
  276. } else {
  277. int dirSim = 0;
  278. for (; dirSim < dirMin; dirSim++) {
  279. if (a.charAt(dirSim) != b.charAt(dirSim))
  280. break;
  281. }
  282. dirScoreLtr = (dirSim * 100) / dirMax;
  283. if (dirScoreLtr == 100) {
  284. dirScoreRtl = 100;
  285. } else {
  286. for (dirSim = 0; dirSim < dirMin; dirSim++) {
  287. if (a.charAt(aDirLen - 1 - dirSim) != b.charAt(bDirLen - 1
  288. - dirSim))
  289. break;
  290. }
  291. dirScoreRtl = (dirSim * 100) / dirMax;
  292. }
  293. }
  294. int fileMin = Math.min(a.length() - aDirLen, b.length() - bDirLen);
  295. int fileMax = Math.max(a.length() - aDirLen, b.length() - bDirLen);
  296. int fileSim = 0;
  297. for (; fileSim < fileMin; fileSim++) {
  298. if (a.charAt(a.length() - 1 - fileSim) != b.charAt(b.length() - 1
  299. - fileSim))
  300. break;
  301. }
  302. int fileScore = (fileSim * 100) / fileMax;
  303. return (((dirScoreLtr + dirScoreRtl) * 25) + (fileScore * 50)) / 100;
  304. }
  305. private SimilarityIndex hash(DiffEntry.Side side, DiffEntry ent)
  306. throws IOException, TableFullException {
  307. SimilarityIndex r = new SimilarityIndex();
  308. r.hash(reader.open(side, ent));
  309. r.sort();
  310. return r;
  311. }
  312. private long size(DiffEntry.Side side, DiffEntry ent) throws IOException {
  313. return reader.size(side, ent);
  314. }
  315. private static int score(long value) {
  316. return (int) (value >>> SCORE_SHIFT);
  317. }
  318. static int srcFile(long value) {
  319. return decodeFile(((int) (value >>> BITS_PER_INDEX)) & INDEX_MASK);
  320. }
  321. static int dstFile(long value) {
  322. return decodeFile(((int) value) & INDEX_MASK);
  323. }
  324. static long encode(int score, int srcIdx, int dstIdx) {
  325. return (((long) score) << SCORE_SHIFT) //
  326. | (encodeFile(srcIdx) << BITS_PER_INDEX) //
  327. | encodeFile(dstIdx);
  328. }
  329. private static long encodeFile(int idx) {
  330. // We invert the index so that the first file in the list sorts
  331. // later in the table. This permits us to break ties favoring
  332. // earlier names over later ones.
  333. //
  334. return INDEX_MASK - idx;
  335. }
  336. private static int decodeFile(int v) {
  337. return INDEX_MASK - v;
  338. }
  339. private static boolean isFile(FileMode mode) {
  340. return (mode.getBits() & FileMode.TYPE_MASK) == FileMode.TYPE_FILE;
  341. }
  342. }