You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

SimilarityRenameDetector.java 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430
  1. /*
  2. * Copyright (C) 2010, Google Inc. and others
  3. *
  4. * This program and the accompanying materials are made available under the
  5. * terms of the Eclipse Distribution License v. 1.0 which is available at
  6. * https://www.eclipse.org/org/documents/edl-v10.php.
  7. *
  8. * SPDX-License-Identifier: BSD-3-Clause
  9. */
  10. package org.eclipse.jgit.diff;
  11. import static org.eclipse.jgit.diff.DiffEntry.Side.NEW;
  12. import static org.eclipse.jgit.diff.DiffEntry.Side.OLD;
  13. import static org.eclipse.jgit.storage.pack.PackConfig.DEFAULT_BIG_FILE_THRESHOLD;
  14. import java.io.IOException;
  15. import java.util.ArrayList;
  16. import java.util.Arrays;
  17. import java.util.BitSet;
  18. import java.util.List;
  19. import org.eclipse.jgit.diff.DiffEntry.ChangeType;
  20. import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
  21. import org.eclipse.jgit.errors.CancelledException;
  22. import org.eclipse.jgit.internal.JGitText;
  23. import org.eclipse.jgit.lib.FileMode;
  24. import org.eclipse.jgit.lib.NullProgressMonitor;
  25. import org.eclipse.jgit.lib.ObjectLoader;
  26. import org.eclipse.jgit.lib.ProgressMonitor;
  27. class SimilarityRenameDetector {
  28. /**
  29. * Number of bits we need to express an index into src or dst list.
  30. * <p>
  31. * This must be 28, giving us a limit of 2^28 entries in either list, which
  32. * is an insane limit of 536,870,912 file names being considered in a single
  33. * rename pass. The other 8 bits are used to store the score, while staying
  34. * under 127 so the long doesn't go negative.
  35. */
  36. private static final int BITS_PER_INDEX = 28;
  37. private static final int INDEX_MASK = (1 << BITS_PER_INDEX) - 1;
  38. private static final int SCORE_SHIFT = 2 * BITS_PER_INDEX;
  39. private ContentSource.Pair reader;
  40. /**
  41. * All sources to consider for copies or renames.
  42. * <p>
  43. * A source is typically a {@link ChangeType#DELETE} change, but could be
  44. * another type when trying to perform copy detection concurrently with
  45. * rename detection.
  46. */
  47. private List<DiffEntry> srcs;
  48. /**
  49. * All destinations to consider looking for a rename.
  50. * <p>
  51. * A destination is typically an {@link ChangeType#ADD}, as the name has
  52. * just come into existence, and we want to discover where its initial
  53. * content came from.
  54. */
  55. private List<DiffEntry> dsts;
  56. /**
  57. * Matrix of all examined file pairs, and their scores.
  58. * <p>
  59. * The upper 8 bits of each long stores the score, but the score is bounded
  60. * to be in the range (0, 128] so that the highest bit is never set, and all
  61. * entries are therefore positive.
  62. * <p>
  63. * List indexes to an element of {@link #srcs} and {@link #dsts} are encoded
  64. * as the lower two groups of 28 bits, respectively, but the encoding is
  65. * inverted, so that 0 is expressed as {@code (1 << 28) - 1}. This sorts
  66. * lower list indices later in the matrix, giving precedence to files whose
  67. * names sort earlier in the tree.
  68. */
  69. private long[] matrix;
  70. /** Score a pair must exceed to be considered a rename. */
  71. private int renameScore = 60;
  72. /**
  73. * File size threshold (in bytes) for detecting renames. Files larger
  74. * than this size will not be processed for renames.
  75. */
  76. private int bigFileThreshold = DEFAULT_BIG_FILE_THRESHOLD;
  77. /** Skip content renames for binary files. */
  78. private boolean skipBinaryFiles = false;
  79. /** Set if any {@link SimilarityIndex.TableFullException} occurs. */
  80. private boolean tableOverflow;
  81. private List<DiffEntry> out;
  /**
   * Create a detector over the given candidate lists.
   * <p>
   * The lists are stored by reference, not copied.
   *
   * @param reader
   *            used to read sizes and contents of both sides.
   * @param srcs
   *            candidate sources.
   * @param dsts
   *            candidate destinations.
   */
  SimilarityRenameDetector(ContentSource.Pair reader, List<DiffEntry> srcs,
      List<DiffEntry> dsts) {
    this.dsts = dsts;
    this.srcs = srcs;
    this.reader = reader;
  }
  88. void setRenameScore(int score) {
  89. renameScore = score;
  90. }
  91. void setBigFileThreshold(int threshold) {
  92. bigFileThreshold = threshold;
  93. }
  94. void setSkipBinaryFiles(boolean value) {
  95. skipBinaryFiles = value;
  96. }
  97. void compute(ProgressMonitor pm) throws IOException, CancelledException {
  98. if (pm == null)
  99. pm = NullProgressMonitor.INSTANCE;
  100. pm.beginTask(JGitText.get().renamesFindingByContent, //
  101. 2 * srcs.size() * dsts.size());
  102. int mNext = buildMatrix(pm);
  103. out = new ArrayList<>(Math.min(mNext, dsts.size()));
  104. // Match rename pairs on a first come, first serve basis until
  105. // we have looked at everything that is above our minimum score.
  106. //
  107. for (--mNext; mNext >= 0; mNext--) {
  108. if (pm.isCancelled()) {
  109. // TODO(ms): use org.eclipse.jgit.api.errors.CanceledException
  110. // in next major version
  111. throw new CancelledException(JGitText.get().renameCancelled);
  112. }
  113. long ent = matrix[mNext];
  114. int sIdx = srcFile(ent);
  115. int dIdx = dstFile(ent);
  116. DiffEntry s = srcs.get(sIdx);
  117. DiffEntry d = dsts.get(dIdx);
  118. if (d == null) {
  119. pm.update(1);
  120. continue; // was already matched earlier
  121. }
  122. ChangeType type;
  123. if (s.changeType == ChangeType.DELETE) {
  124. // First use of this source file. Tag it as a rename so we
  125. // later know it is already been used as a rename, other
  126. // matches (if any) will claim themselves as copies instead.
  127. //
  128. s.changeType = ChangeType.RENAME;
  129. type = ChangeType.RENAME;
  130. } else {
  131. type = ChangeType.COPY;
  132. }
  133. out.add(DiffEntry.pair(type, s, d, score(ent)));
  134. dsts.set(dIdx, null); // Claim the destination was matched.
  135. pm.update(1);
  136. }
  137. srcs = compactSrcList(srcs);
  138. dsts = compactDstList(dsts);
  139. pm.endTask();
  140. }
  141. List<DiffEntry> getMatches() {
  142. return out;
  143. }
  144. List<DiffEntry> getLeftOverSources() {
  145. return srcs;
  146. }
  147. List<DiffEntry> getLeftOverDestinations() {
  148. return dsts;
  149. }
  150. boolean isTableOverflow() {
  151. return tableOverflow;
  152. }
  153. private static List<DiffEntry> compactSrcList(List<DiffEntry> in) {
  154. ArrayList<DiffEntry> r = new ArrayList<>(in.size());
  155. for (DiffEntry e : in) {
  156. if (e.changeType == ChangeType.DELETE)
  157. r.add(e);
  158. }
  159. return r;
  160. }
  161. private static List<DiffEntry> compactDstList(List<DiffEntry> in) {
  162. ArrayList<DiffEntry> r = new ArrayList<>(in.size());
  163. for (DiffEntry e : in) {
  164. if (e != null)
  165. r.add(e);
  166. }
  167. return r;
  168. }
  169. private int buildMatrix(ProgressMonitor pm)
  170. throws IOException, CancelledException {
  171. // Allocate for the worst-case scenario where every pair has a
  172. // score that we need to consider. We might not need that many.
  173. //
  174. matrix = new long[srcs.size() * dsts.size()];
  175. long[] srcSizes = new long[srcs.size()];
  176. long[] dstSizes = new long[dsts.size()];
  177. BitSet dstTooLarge = null;
  178. // Consider each pair of files, if the score is above the minimum
  179. // threshold we need record that scoring in the matrix so we can
  180. // later find the best matches.
  181. //
  182. int mNext = 0;
  183. SRC: for (int srcIdx = 0; srcIdx < srcs.size(); srcIdx++) {
  184. DiffEntry srcEnt = srcs.get(srcIdx);
  185. if (!isFile(srcEnt.oldMode)) {
  186. pm.update(dsts.size());
  187. continue;
  188. }
  189. SimilarityIndex s = null;
  190. for (int dstIdx = 0; dstIdx < dsts.size(); dstIdx++) {
  191. if (pm.isCancelled()) {
  192. // TODO(ms): use
  193. // org.eclipse.jgit.api.errors.CanceledException in next
  194. // major version
  195. throw new CancelledException(
  196. JGitText.get().renameCancelled);
  197. }
  198. DiffEntry dstEnt = dsts.get(dstIdx);
  199. if (!isFile(dstEnt.newMode)) {
  200. pm.update(1);
  201. continue;
  202. }
  203. if (!RenameDetector.sameType(srcEnt.oldMode, dstEnt.newMode)) {
  204. pm.update(1);
  205. continue;
  206. }
  207. if (dstTooLarge != null && dstTooLarge.get(dstIdx)) {
  208. pm.update(1);
  209. continue;
  210. }
  211. long srcSize = srcSizes[srcIdx];
  212. if (srcSize == 0) {
  213. srcSize = size(OLD, srcEnt) + 1;
  214. srcSizes[srcIdx] = srcSize;
  215. }
  216. long dstSize = dstSizes[dstIdx];
  217. if (dstSize == 0) {
  218. dstSize = size(NEW, dstEnt) + 1;
  219. dstSizes[dstIdx] = dstSize;
  220. }
  221. long max = Math.max(srcSize, dstSize);
  222. long min = Math.min(srcSize, dstSize);
  223. if (min * 100 / max < renameScore) {
  224. // Cannot possibly match, as the file sizes are so different
  225. pm.update(1);
  226. continue;
  227. }
  228. if (max > bigFileThreshold) {
  229. pm.update(1);
  230. continue;
  231. }
  232. if (s == null) {
  233. try {
  234. ObjectLoader loader = reader.open(OLD, srcEnt);
  235. if (skipBinaryFiles && SimilarityIndex.isBinary(loader)) {
  236. pm.update(1);
  237. continue SRC;
  238. }
  239. s = hash(loader);
  240. } catch (TableFullException tableFull) {
  241. tableOverflow = true;
  242. continue SRC;
  243. }
  244. }
  245. SimilarityIndex d;
  246. try {
  247. ObjectLoader loader = reader.open(NEW, dstEnt);
  248. if (skipBinaryFiles && SimilarityIndex.isBinary(loader)) {
  249. pm.update(1);
  250. continue;
  251. }
  252. d = hash(loader);
  253. } catch (TableFullException tableFull) {
  254. if (dstTooLarge == null)
  255. dstTooLarge = new BitSet(dsts.size());
  256. dstTooLarge.set(dstIdx);
  257. tableOverflow = true;
  258. pm.update(1);
  259. continue;
  260. }
  261. int contentScore = s.score(d, 10000);
  262. // nameScore returns a value between 0 and 100, but we want it
  263. // to be in the same range as the content score. This allows it
  264. // to be dropped into the pretty formula for the final score.
  265. int nameScore = nameScore(srcEnt.oldPath, dstEnt.newPath) * 100;
  266. int score = (contentScore * 99 + nameScore * 1) / 10000;
  267. if (score < renameScore) {
  268. pm.update(1);
  269. continue;
  270. }
  271. matrix[mNext++] = encode(score, srcIdx, dstIdx);
  272. pm.update(1);
  273. }
  274. }
  275. // Sort everything in the range we populated, which might be the
  276. // entire matrix, or just a smaller slice if we had some bad low
  277. // scoring pairs.
  278. //
  279. Arrays.sort(matrix, 0, mNext);
  280. return mNext;
  281. }
  282. static int nameScore(String a, String b) {
  283. int aDirLen = a.lastIndexOf('/') + 1;
  284. int bDirLen = b.lastIndexOf('/') + 1;
  285. int dirMin = Math.min(aDirLen, bDirLen);
  286. int dirMax = Math.max(aDirLen, bDirLen);
  287. final int dirScoreLtr;
  288. final int dirScoreRtl;
  289. if (dirMax == 0) {
  290. dirScoreLtr = 100;
  291. dirScoreRtl = 100;
  292. } else {
  293. int dirSim = 0;
  294. for (; dirSim < dirMin; dirSim++) {
  295. if (a.charAt(dirSim) != b.charAt(dirSim))
  296. break;
  297. }
  298. dirScoreLtr = (dirSim * 100) / dirMax;
  299. if (dirScoreLtr == 100) {
  300. dirScoreRtl = 100;
  301. } else {
  302. for (dirSim = 0; dirSim < dirMin; dirSim++) {
  303. if (a.charAt(aDirLen - 1 - dirSim) != b.charAt(bDirLen - 1
  304. - dirSim))
  305. break;
  306. }
  307. dirScoreRtl = (dirSim * 100) / dirMax;
  308. }
  309. }
  310. int fileMin = Math.min(a.length() - aDirLen, b.length() - bDirLen);
  311. int fileMax = Math.max(a.length() - aDirLen, b.length() - bDirLen);
  312. int fileSim = 0;
  313. for (; fileSim < fileMin; fileSim++) {
  314. if (a.charAt(a.length() - 1 - fileSim) != b.charAt(b.length() - 1
  315. - fileSim))
  316. break;
  317. }
  318. int fileScore = (fileSim * 100) / fileMax;
  319. return (((dirScoreLtr + dirScoreRtl) * 25) + (fileScore * 50)) / 100;
  320. }
  321. private SimilarityIndex hash(ObjectLoader objectLoader)
  322. throws IOException, TableFullException {
  323. SimilarityIndex r = new SimilarityIndex();
  324. r.hash(objectLoader);
  325. r.sort();
  326. return r;
  327. }
  328. private long size(DiffEntry.Side side, DiffEntry ent) throws IOException {
  329. return reader.size(side, ent);
  330. }
  331. private static int score(long value) {
  332. return (int) (value >>> SCORE_SHIFT);
  333. }
  334. static int srcFile(long value) {
  335. return decodeFile(((int) (value >>> BITS_PER_INDEX)) & INDEX_MASK);
  336. }
  337. static int dstFile(long value) {
  338. return decodeFile(((int) value) & INDEX_MASK);
  339. }
  340. static long encode(int score, int srcIdx, int dstIdx) {
  341. return (((long) score) << SCORE_SHIFT) //
  342. | (encodeFile(srcIdx) << BITS_PER_INDEX) //
  343. | encodeFile(dstIdx);
  344. }
  345. private static long encodeFile(int idx) {
  346. // We invert the index so that the first file in the list sorts
  347. // later in the table. This permits us to break ties favoring
  348. // earlier names over later ones.
  349. //
  350. return INDEX_MASK - idx;
  351. }
  352. private static int decodeFile(int v) {
  353. return INDEX_MASK - v;
  354. }
  355. private static boolean isFile(FileMode mode) {
  356. return (mode.getBits() & FileMode.TYPE_MASK) == FileMode.TYPE_FILE;
  357. }
  358. }