3 * Copyright (C) 2009-2023 SonarSource SA
4 * mailto:info AT sonarsource DOT com
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 3 of the License, or (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with this program; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 package org.sonar.ce.task.projectanalysis.filemove;
22 import com.google.common.collect.ArrayListMultimap;
23 import com.google.common.collect.ImmutableList;
24 import com.google.common.collect.ImmutableMap;
25 import com.google.common.collect.Multimap;
26 import com.google.common.collect.Sets;
27 import java.util.ArrayList;
28 import java.util.Arrays;
29 import java.util.Collection;
30 import java.util.Comparator;
31 import java.util.HashMap;
32 import java.util.HashSet;
33 import java.util.Iterator;
34 import java.util.List;
37 import java.util.stream.Collectors;
38 import javax.annotation.Nullable;
39 import javax.annotation.concurrent.Immutable;
40 import org.apache.ibatis.session.ResultContext;
41 import org.apache.ibatis.session.ResultHandler;
42 import org.sonar.api.utils.log.Logger;
43 import org.sonar.api.utils.log.Loggers;
44 import org.sonar.ce.task.projectanalysis.analysis.AnalysisMetadataHolder;
45 import org.sonar.ce.task.projectanalysis.component.Component;
46 import org.sonar.ce.task.projectanalysis.component.CrawlerDepthLimit;
47 import org.sonar.ce.task.projectanalysis.component.DepthTraversalTypeAwareCrawler;
48 import org.sonar.ce.task.projectanalysis.component.TreeRootHolder;
49 import org.sonar.ce.task.projectanalysis.component.TypeAwareVisitorAdapter;
50 import org.sonar.ce.task.projectanalysis.filemove.FileSimilarity.File;
51 import org.sonar.ce.task.projectanalysis.filemove.FileSimilarity.FileImpl;
52 import org.sonar.ce.task.projectanalysis.filemove.FileSimilarity.LazyFileImpl;
53 import org.sonar.ce.task.projectanalysis.source.SourceLinesHashRepository;
54 import org.sonar.ce.task.step.ComputationStep;
55 import org.sonar.core.util.logs.Profiler;
56 import org.sonar.core.util.stream.MoreCollectors;
57 import org.sonar.db.DbClient;
58 import org.sonar.db.DbSession;
59 import org.sonar.db.component.FileMoveRowDto;
60 import org.sonar.db.source.LineHashesWithUuidDto;
62 import static org.sonar.ce.task.projectanalysis.component.ComponentVisitor.Order.POST_ORDER;
64 public class FileMoveDetectionStep implements ComputationStep {
65 static final int MIN_REQUIRED_SCORE = 85;
66 private static final Logger LOG = Loggers.get(FileMoveDetectionStep.class);
67 private static final Comparator<ScoreMatrix.ScoreFile> SCORE_FILE_COMPARATOR = (o1, o2) -> -1 * Integer.compare(o1.getLineCount(), o2.getLineCount());
68 private static final double LOWER_BOUND_RATIO = 0.84;
69 private static final double UPPER_BOUND_RATIO = 1.18;
71 private final AnalysisMetadataHolder analysisMetadataHolder;
72 private final TreeRootHolder rootHolder;
73 private final DbClient dbClient;
74 private final FileSimilarity fileSimilarity;
75 private final MutableMovedFilesRepository movedFilesRepository;
76 private final SourceLinesHashRepository sourceLinesHash;
77 private final ScoreMatrixDumper scoreMatrixDumper;
78 private final MutableAddedFileRepository addedFileRepository;
80 public FileMoveDetectionStep(AnalysisMetadataHolder analysisMetadataHolder, TreeRootHolder rootHolder, DbClient dbClient,
81 FileSimilarity fileSimilarity, MutableMovedFilesRepository movedFilesRepository, SourceLinesHashRepository sourceLinesHash,
82 ScoreMatrixDumper scoreMatrixDumper, MutableAddedFileRepository addedFileRepository) {
83 this.analysisMetadataHolder = analysisMetadataHolder;
84 this.rootHolder = rootHolder;
85 this.dbClient = dbClient;
86 this.fileSimilarity = fileSimilarity;
87 this.movedFilesRepository = movedFilesRepository;
88 this.sourceLinesHash = sourceLinesHash;
89 this.scoreMatrixDumper = scoreMatrixDumper;
90 this.addedFileRepository = addedFileRepository;
94 public String getDescription() {
95 return "Detect file moves";
99 public void execute(ComputationStep.Context context) {
100 if (analysisMetadataHolder.isPullRequest()) {
101 LOG.debug("Currently within Pull Request scope. Do nothing.");
105 // do nothing if no files in db (first analysis)
106 if (analysisMetadataHolder.isFirstAnalysis()) {
107 LOG.debug("First analysis. Do nothing.");
110 Profiler p = Profiler.createIfTrace(LOG);
113 Map<String, Component> reportFilesByUuid = getReportFilesByUuid(this.rootHolder.getRoot());
114 context.getStatistics().add("reportFiles", reportFilesByUuid.size());
115 if (reportFilesByUuid.isEmpty()) {
116 LOG.debug("No files in report. No file move detection.");
120 Map<String, DbComponent> dbFilesByUuid = getDbFilesByUuid();
121 context.getStatistics().add("dbFiles", dbFilesByUuid.size());
123 Set<String> addedFileUuids = difference(reportFilesByUuid.keySet(), dbFilesByUuid.keySet());
124 context.getStatistics().add("addedFiles", addedFileUuids.size());
126 if (dbFilesByUuid.isEmpty()) {
127 registerAddedFiles(addedFileUuids, reportFilesByUuid, null);
128 LOG.debug("Previous snapshot has no file. No file move detection.");
132 Set<String> removedFileUuids = difference(dbFilesByUuid.keySet(), reportFilesByUuid.keySet());
134 // can't find matches if at least one of the added or removed files groups is empty => abort
135 if (addedFileUuids.isEmpty() || removedFileUuids.isEmpty()) {
136 registerAddedFiles(addedFileUuids, reportFilesByUuid, null);
137 LOG.debug("Either no files added or no files removed. Do nothing.");
141 // retrieve file data from report
142 Map<String, File> addedFileHashesByUuid = getReportFileHashesByUuid(reportFilesByUuid, addedFileUuids);
143 p.stopTrace("loaded");
145 // compute score matrix
147 ScoreMatrix scoreMatrix = computeScoreMatrix(dbFilesByUuid, removedFileUuids, addedFileHashesByUuid);
148 p.stopTrace("Score matrix computed");
149 scoreMatrixDumper.dumpAsCsv(scoreMatrix);
151 // not a single match with score higher than MIN_REQUIRED_SCORE => abort
152 if (scoreMatrix.getMaxScore() < MIN_REQUIRED_SCORE) {
153 context.getStatistics().add("movedFiles", 0);
154 registerAddedFiles(addedFileUuids, reportFilesByUuid, null);
155 LOG.debug("max score in matrix is less than min required score ({}). Do nothing.", MIN_REQUIRED_SCORE);
160 MatchesByScore matchesByScore = MatchesByScore.create(scoreMatrix);
162 ElectedMatches electedMatches = electMatches(removedFileUuids, addedFileHashesByUuid, matchesByScore);
163 p.stopTrace("Matches elected");
165 context.getStatistics().add("movedFiles", electedMatches.size());
166 registerMatches(dbFilesByUuid, reportFilesByUuid, electedMatches);
167 registerAddedFiles(addedFileUuids, reportFilesByUuid, electedMatches);
170 public Set<String> difference(Set<String> set1, Set<String> set2) {
171 if (set1.isEmpty() || set2.isEmpty()) {
174 return Sets.difference(set1, set2).immutableCopy();
177 private void registerMatches(Map<String, DbComponent> dbFilesByUuid, Map<String, Component> reportFilesByUuid, ElectedMatches electedMatches) {
178 LOG.debug("{} files moves found", electedMatches.size());
179 for (Match validatedMatch : electedMatches) {
180 movedFilesRepository.setOriginalFile(
181 reportFilesByUuid.get(validatedMatch.getReportUuid()),
182 toOriginalFile(dbFilesByUuid.get(validatedMatch.getDbUuid())));
183 LOG.trace("File move found: {}", validatedMatch);
187 private void registerAddedFiles(Set<String> addedFileUuids, Map<String, Component> reportFilesByUuid, @Nullable ElectedMatches electedMatches) {
188 if (electedMatches == null || electedMatches.isEmpty()) {
189 addedFileUuids.stream()
190 .map(reportFilesByUuid::get)
191 .forEach(addedFileRepository::register);
193 Set<String> reallyAddedFileUuids = new HashSet<>(addedFileUuids);
194 for (Match electedMatch : electedMatches) {
195 reallyAddedFileUuids.remove(electedMatch.getReportUuid());
197 reallyAddedFileUuids.stream()
198 .map(reportFilesByUuid::get)
199 .forEach(addedFileRepository::register);
203 private Map<String, DbComponent> getDbFilesByUuid() {
204 try (DbSession dbSession = dbClient.openSession(false)) {
205 ImmutableList.Builder<DbComponent> builder = ImmutableList.builder();
206 dbClient.componentDao().scrollAllFilesForFileMove(dbSession, rootHolder.getRoot().getUuid(),
208 FileMoveRowDto row = resultContext.getResultObject();
209 builder.add(new DbComponent(row.getKey(), row.getUuid(), row.getPath(), row.getLineCount()));
211 return builder.build().stream()
212 .collect(MoreCollectors.uniqueIndex(DbComponent::getUuid));
216 private static Map<String, Component> getReportFilesByUuid(Component root) {
217 final ImmutableMap.Builder<String, Component> builder = ImmutableMap.builder();
218 new DepthTraversalTypeAwareCrawler(
219 new TypeAwareVisitorAdapter(CrawlerDepthLimit.FILE, POST_ORDER) {
221 public void visitFile(Component file) {
222 builder.put(file.getUuid(), file);
225 return builder.build();
228 private Map<String, File> getReportFileHashesByUuid(Map<String, Component> reportFilesByUuid, Set<String> addedFileUuids) {
229 return addedFileUuids.stream().collect(Collectors.toMap(fileUuid -> fileUuid, fileUuid -> {
230 Component component = reportFilesByUuid.get(fileUuid);
231 return new LazyFileImpl(() -> getReportFileLineHashes(component), component.getFileAttributes().getLines());
235 private List<String> getReportFileLineHashes(Component component) {
236 // this is not ideal because if the file moved, this component won't exist in DB with the same UUID.
237 // Assuming that the file also had significant code before the move, it will be fine.
238 return sourceLinesHash.getLineHashesMatchingDBVersion(component);
241 private ScoreMatrix computeScoreMatrix(Map<String, DbComponent> dtosByUuid, Set<String> removedFileUuids, Map<String, File> addedFileHashesByUuid) {
242 ScoreMatrix.ScoreFile[] addedFiles = addedFileHashesByUuid.entrySet().stream()
243 .map(e -> new ScoreMatrix.ScoreFile(e.getKey(), e.getValue().getLineCount()))
244 .toArray(ScoreMatrix.ScoreFile[]::new);
245 ScoreMatrix.ScoreFile[] removedFiles = removedFileUuids.stream()
247 DbComponent dbComponent = dtosByUuid.get(key);
248 return new ScoreMatrix.ScoreFile(dbComponent.getUuid(), dbComponent.getLineCount());
250 .toArray(ScoreMatrix.ScoreFile[]::new);
252 // sort by highest line count first
253 Arrays.sort(addedFiles, SCORE_FILE_COMPARATOR);
254 Arrays.sort(removedFiles, SCORE_FILE_COMPARATOR);
255 int[][] scoreMatrix = new int[removedFiles.length][addedFiles.length];
256 int smallestAddedFileSize = addedFiles[0].getLineCount();
257 int largestAddedFileSize = addedFiles[addedFiles.length - 1].getLineCount();
259 Map<String, Integer> removedFilesIndexesByUuid = new HashMap<>(removedFileUuids.size());
260 for (int removeFileIndex = 0; removeFileIndex < removedFiles.length; removeFileIndex++) {
261 ScoreMatrix.ScoreFile removedFile = removedFiles[removeFileIndex];
262 int lowerBound = (int) Math.floor(removedFile.getLineCount() * LOWER_BOUND_RATIO);
263 int upperBound = (int) Math.ceil(removedFile.getLineCount() * UPPER_BOUND_RATIO);
264 // no need to compute score if all files are out of bound, so no need to load line hashes from DB
265 if (smallestAddedFileSize <= lowerBound || largestAddedFileSize >= upperBound) {
268 removedFilesIndexesByUuid.put(removedFile.getFileUuid(), removeFileIndex);
271 LineHashesWithKeyDtoResultHandler rowHandler = new LineHashesWithKeyDtoResultHandler(removedFilesIndexesByUuid, removedFiles,
272 addedFiles, addedFileHashesByUuid, scoreMatrix);
273 try (DbSession dbSession = dbClient.openSession(false)) {
274 dbClient.fileSourceDao().scrollLineHashes(dbSession, removedFilesIndexesByUuid.keySet(), rowHandler);
277 return new ScoreMatrix(removedFiles, addedFiles, scoreMatrix, rowHandler.getMaxScore());
280 private final class LineHashesWithKeyDtoResultHandler implements ResultHandler<LineHashesWithUuidDto> {
281 private final Map<String, Integer> removedFileIndexesByUuid;
282 private final ScoreMatrix.ScoreFile[] removedFiles;
283 private final ScoreMatrix.ScoreFile[] newFiles;
284 private final Map<String, File> newFilesByUuid;
285 private final int[][] scoreMatrix;
286 private int maxScore;
288 private LineHashesWithKeyDtoResultHandler(Map<String, Integer> removedFileIndexesByUuid, ScoreMatrix.ScoreFile[] removedFiles,
289 ScoreMatrix.ScoreFile[] newFiles, Map<String, File> newFilesByUuid,
290 int[][] scoreMatrix) {
291 this.removedFileIndexesByUuid = removedFileIndexesByUuid;
292 this.removedFiles = removedFiles;
293 this.newFiles = newFiles;
294 this.newFilesByUuid = newFilesByUuid;
295 this.scoreMatrix = scoreMatrix;
299 public void handleResult(ResultContext<? extends LineHashesWithUuidDto> resultContext) {
300 LineHashesWithUuidDto lineHashesDto = resultContext.getResultObject();
301 if (lineHashesDto.getPath() == null) {
304 int removedFileIndex = removedFileIndexesByUuid.get(lineHashesDto.getUuid());
305 ScoreMatrix.ScoreFile removedFile = removedFiles[removedFileIndex];
306 int lowerBound = (int) Math.floor(removedFile.getLineCount() * LOWER_BOUND_RATIO);
307 int upperBound = (int) Math.ceil(removedFile.getLineCount() * UPPER_BOUND_RATIO);
309 for (int newFileIndex = 0; newFileIndex < newFiles.length; newFileIndex++) {
310 ScoreMatrix.ScoreFile newFile = newFiles[newFileIndex];
311 if (newFile.getLineCount() >= upperBound) {
314 if (newFile.getLineCount() <= lowerBound) {
318 File fileHashesInDb = new FileImpl(lineHashesDto.getLineHashes());
319 File unmatchedFile = newFilesByUuid.get(newFile.getFileUuid());
320 int score = fileSimilarity.score(fileHashesInDb, unmatchedFile);
321 scoreMatrix[removedFileIndex][newFileIndex] = score;
322 if (score > maxScore) {
333 private static ElectedMatches electMatches(Set<String> dbFileUuids, Map<String, File> reportFileSourcesByUuid, MatchesByScore matchesByScore) {
334 ElectedMatches electedMatches = new ElectedMatches(matchesByScore, dbFileUuids, reportFileSourcesByUuid);
335 Multimap<String, Match> matchesPerFileForScore = ArrayListMultimap.create();
336 matchesByScore.forEach(matches -> electMatches(matches, electedMatches, matchesPerFileForScore));
337 return electedMatches;
340 private static void electMatches(@Nullable List<Match> matches, ElectedMatches electedMatches, Multimap<String, Match> matchesPerFileForScore) {
341 // no match for this score value, ignore
342 if (matches == null) {
346 List<Match> matchesToValidate = electedMatches.filter(matches);
347 if (matchesToValidate.isEmpty()) {
350 if (matchesToValidate.size() == 1) {
351 Match match = matchesToValidate.get(0);
352 electedMatches.add(match);
354 matchesPerFileForScore.clear();
355 for (Match match : matchesToValidate) {
356 matchesPerFileForScore.put(match.getDbUuid(), match);
357 matchesPerFileForScore.put(match.getReportUuid(), match);
359 // validate non-ambiguous matches (i.e. the match is the only match of either the db file and the report file)
360 for (Match match : matchesToValidate) {
361 int dbFileMatchesCount = matchesPerFileForScore.get(match.getDbUuid()).size();
362 int reportFileMatchesCount = matchesPerFileForScore.get(match.getReportUuid()).size();
363 if (dbFileMatchesCount == 1 && reportFileMatchesCount == 1) {
364 electedMatches.add(match);
370 private static MovedFilesRepository.OriginalFile toOriginalFile(DbComponent dbComponent) {
371 return new MovedFilesRepository.OriginalFile(dbComponent.getUuid(), dbComponent.getKey());
375 public static final class DbComponent {
376 private final String key;
377 private final String uuid;
378 private final String path;
379 private final int lineCount;
381 public DbComponent(String key, String uuid, String path, int lineCount) {
385 this.lineCount = lineCount;
388 public String getKey() {
392 public String getUuid() {
396 public String getPath() {
400 public int getLineCount() {
405 private static class ElectedMatches implements Iterable<Match> {
406 private final List<Match> matches;
407 private final Set<String> matchedFileUuids;
409 public ElectedMatches(MatchesByScore matchesByScore, Set<String> dbFileUuids, Map<String, File> reportFileHashesByUuid) {
410 this.matches = new ArrayList<>(matchesByScore.getSize());
411 this.matchedFileUuids = new HashSet<>(dbFileUuids.size() + reportFileHashesByUuid.size());
414 public void add(Match match) {
416 matchedFileUuids.add(match.getDbUuid());
417 matchedFileUuids.add(match.getReportUuid());
420 public List<Match> filter(Collection<Match> matches) {
421 return matches.stream().filter(this::notAlreadyMatched).collect(Collectors.toList());
424 private boolean notAlreadyMatched(Match input) {
425 return !(matchedFileUuids.contains(input.getDbUuid()) || matchedFileUuids.contains(input.getReportUuid()));
429 public Iterator<Match> iterator() {
430 return matches.iterator();
434 return matches.size();
437 public boolean isEmpty() {
438 return matches.isEmpty();