source.dussan.org Git - sonarqube.git/blob

   1 /*
   2  * Sonar, open source software quality management tool.
   3  * Copyright (C) 2008-2012 SonarSource
   4  * mailto:contact AT sonarsource DOT com
   5  *
   6  * Sonar is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 3 of the License, or (at your option) any later version.
  10  *
  11  * Sonar is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with Sonar; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  19  */
  20 package org.sonar.duplications.detector.original;
  21
  22 import com.google.common.collect.Lists;
  23 import com.google.common.collect.Maps;
  24 import org.sonar.duplications.block.Block;
  25 import org.sonar.duplications.block.ByteArray;
  26 import org.sonar.duplications.index.CloneGroup;
  27 import org.sonar.duplications.index.CloneIndex;
  28 import org.sonar.duplications.index.ClonePart;
  29
  30 import java.util.Collection;
  31 import java.util.Collections;
  32 import java.util.List;
  33 import java.util.Map;
  34
  35 /**
  36  * Implementation of algorithm described in paper
  37  * <a href="http://www4.in.tum.de/~juergens/publications/icsm2010_crc.pdf">Index-Based Code Clone Detection: Incremental, Distributed, Scalable</a>
  38  * by Benjamin Hummel, Elmar Juergens, Michael Conradt and Lars Heinemann.
  39  */
  40 public final class OriginalCloneDetectionAlgorithm {
  41
  42   /**
  43    * Performs detection and returns list of clone groups between file (which represented as a collection of blocks) and index.
  44    * Note that this method ignores blocks for this file, that will be retrieved from index.
  45    */
  46   public static List<CloneGroup> detect(CloneIndex cloneIndex, Collection<Block> fileBlocks) {
  47     if (fileBlocks.isEmpty()) {
  48       return Collections.EMPTY_LIST;
  49     }
  50     OriginalCloneDetectionAlgorithm reporter = new OriginalCloneDetectionAlgorithm(cloneIndex);
  51     reporter.findClones(fileBlocks);
  52     return reporter.filter.getResult();
  53   }
  54
  55   private final CloneIndex cloneIndex;
  56
  57   private final Filter filter = new Filter();
  58
  59   private String originResourceId;
  60
  61   private OriginalCloneDetectionAlgorithm(CloneIndex cloneIndex) {
  62     this.cloneIndex = cloneIndex;
  63   }
  64
  65   private BlocksGroup[] createGroups(Collection<Block> fileBlocks) {
  66     // 2: let f be the list of tuples corresponding to filename sorted by statement index
  67     // either read from the index or calculated on the fly
  68     int size = fileBlocks.size();
  69
  70     // Godin: create one group per unique hash
  71     Map<ByteArray, BlocksGroup> groupsByHash = Maps.newHashMap(); // TODO Godin: can we create map with expected size?
  72     for (Block fileBlock : fileBlocks) {
  73       ByteArray hash = fileBlock.getBlockHash();
  74       BlocksGroup sameHash = groupsByHash.get(hash);
  75       if (sameHash == null) {
  76         sameHash = BlocksGroup.empty();
  77         groupsByHash.put(hash, sameHash);
  78       }
  79       sameHash.blocks.add(fileBlock);
  80     }
  81
  82     // Godin: retrieve blocks from index
  83     for (Map.Entry<ByteArray, BlocksGroup> entry : groupsByHash.entrySet()) {
  84       ByteArray hash = entry.getKey();
  85       BlocksGroup group = entry.getValue();
  86       for (Block blockFromIndex : cloneIndex.getBySequenceHash(hash)) {
  87         // Godin: skip blocks for this file if they come from index
  88         if (!originResourceId.equals(blockFromIndex.getResourceId())) {
  89           group.blocks.add(blockFromIndex);
  90         }
  91       }
  92       Collections.sort(group.blocks, BlocksGroup.BlockComparator.INSTANCE);
  93     }
  94
  95     // 3: let c be a list with c(0) = empty
  96     BlocksGroup[] sameHashBlocksGroups = new BlocksGroup[size + 2];
  97     sameHashBlocksGroups[0] = BlocksGroup.empty();
  98     // 4: for i := 1 to length(f) do
  99     for (Block fileBlock : fileBlocks) {
 100       ByteArray hash = fileBlock.getBlockHash();
 101       int i = fileBlock.getIndexInFile() + 1;
 102       // 5: retrieve tuples with same sequence hash as f(i)
 103       // 6: store this set as c(i)
 104       sameHashBlocksGroups[i] = groupsByHash.get(hash);
 105     }
 106
 107     // Godin: allows to report clones at the end of file, because condition at line 13 would be evaluated as true
 108     sameHashBlocksGroups[size + 1] = BlocksGroup.empty();
 109
 110     return sameHashBlocksGroups;
 111   }
 112
 113   private void findClones(Collection<Block> fileBlocks) {
 114     originResourceId = fileBlocks.iterator().next().getResourceId();
 115
 116     BlocksGroup[] sameHashBlocksGroups = createGroups(fileBlocks);
 117
 118     // 7: for i := 1 to length(c) do
 119     for (int i = 1; i < sameHashBlocksGroups.length; i++) {
 120       // In the main loop (starting from Line 7), we first check
 121       // whether any new clones might start at this position. If there
 122       // is only a single tuple with this hash (which has to belong
 123       // to the inspected file at the current location) we skip this loop
 124       // iteration. The same holds if all tuples at position i have already
 125       // been present at position i − 1, as in this case any clone group
 126       // found at position i would be included in a clone group starting
 127       // at position i − 1.
 128
 129       // Although we use the subset operator in the
 130       // algorithm description, this is not really a subset operation,
 131       // as of course the statement index of the tuples in c(i) will be
 132       // increased by 1 compared to the corresponding ones in c(i − 1)
 133       // and the hash and info fields will differ.
 134
 135       // 8: if |c(i)| < 2 or c(i) subsumed by c(i - 1) then
 136       if (sameHashBlocksGroups[i].size() < 2 || sameHashBlocksGroups[i].subsumedBy(sameHashBlocksGroups[i - 1], 1)) {
 137         // 9: continue with next loop iteration
 138         continue;
 139       }
 140
 141       // The set a introduced in Line 10 is called the active set and
 142       // contains all tuples corresponding to clones which have not yet
 143       // been reported. At each iteration of the inner loop the set a
 144       // is reduced to tuples which are also present in c(j); again the
 145       // intersection operator has to account for the increased statement
 146       // index and different hash and info fields. The new value is
 147       // stored in a0. Clones are only reported, if tuples are lost in
 148       // Line 12, as otherwise all current clones could be prolonged
 149       // by one statement. Clone reporting matches tuples that, after
 150       // correction of the statement index, appear in both c(i) and a;
 151       // each matched pair corresponds to a single clone. Its location
 152       // can be extracted from the filename and info fields.
 153
 154       // 10: let a := c(i)
 155       BlocksGroup currentBlocksGroup = sameHashBlocksGroups[i];
 156       // 11: for j := i + 1 to length(c) do
 157       for (int j = i + 1; j < sameHashBlocksGroups.length; j++) {
 158         // 12: let a0 := a intersect c(j)
 159         BlocksGroup intersectedBlocksGroup = currentBlocksGroup.intersect(sameHashBlocksGroups[j]);
 160
 161         // 13: if |a0| < |a| then
 162         if (intersectedBlocksGroup.size() < currentBlocksGroup.size()) {
 163           // 14: report clones from c(i) to a (see text)
 164
 165           // One problem of this algorithm is that clone classes with
 166           // multiple instances in the same file are encountered and
 167           // reported multiple times. Furthermore, when calculating the clone
 168           // groups for all files in a system, clone groups will be reported
 169           // more than once as well. Both cases can be avoided, by
 170           // checking whether the first element of a0 (with respect to a
 171           // fixed order) is equal to f(j) and only report in this case.
 172
 173           Block first = currentBlocksGroup.first(originResourceId);
 174           if (first.getIndexInFile() == j - 2) {
 175             // Godin: We report clones, which start in i-1 and end in j-2, so length is j-2-(i-1)+1=j-i
 176             reportClones(sameHashBlocksGroups[i], currentBlocksGroup, j - i);
 177           }
 178         }
 179         // 15: a := a0
 180         currentBlocksGroup = intersectedBlocksGroup;
 181
 182         // Line 16 early exits the inner loop if either no more clones are starting
 183         // from position i (i.e., a is too small), or if all tuples from a
 184         // have already been in c(i − 1), corrected for statement index.
 185         // In this case they have already been reported in the previous
 186         // iteration of the outer loop.
 187
 188         // IMPORTANT Godin: note that difference in indexes between "a" and "c(i-1)" greater than one,
 189         // so method subsumedBy should take this into account
 190
 191         // 16: if |a| < 2 or a subsumed by c(i-1) then
 192         if (currentBlocksGroup.size() < 2 || currentBlocksGroup.subsumedBy(sameHashBlocksGroups[i - 1], j - i + 1)) {
 193           // 17: break inner loop
 194           break;
 195         }
 196       }
 197     }
 198   }
 199
 200   private void reportClones(BlocksGroup beginGroup, BlocksGroup endGroup, int cloneLength) {
 201     List<Block[]> pairs = beginGroup.pairs(endGroup, cloneLength);
 202
 203     ClonePart origin = null;
 204     List<ClonePart> parts = Lists.newArrayList();
 205
 206     for (int i = 0; i < pairs.size(); i++) {
 207       Block[] pair = pairs.get(i);
 208       Block firstBlock = pair[0];
 209       Block lastBlock = pair[1];
 210       ClonePart part = new ClonePart(firstBlock.getResourceId(),
 211           firstBlock.getIndexInFile(),
 212           firstBlock.getStartLine(),
 213           lastBlock.getEndLine());
 214
 215       if (originResourceId.equals(part.getResourceId())) {
 216         if (origin == null) {
 217           origin = part;
 218         } else if (part.getUnitStart() < origin.getUnitStart()) {
 219           origin = part;
 220         }
 221       }
 222
 223       parts.add(part);
 224     }
 225
 226     filter.add(new CloneGroup(cloneLength, origin, parts));
 227   }
 228
 229 }