@@ -30,6 +30,10 @@ public abstract class AbstractText implements Text { | |||
this.symbols = new ArrayList<Object>(size); | |||
} | |||
/**
 * Creates a text over the given list of symbols.
 * <p>
 * NOTE(review): the list is stored by reference, not copied — a caller that keeps
 * mutating it after construction mutates this text too; confirm this aliasing is
 * intended before adding a defensive copy.
 *
 * @param symbols symbols of this text, stored as-is (not copied)
 */
public AbstractText(List<Object> symbols) {
  this.symbols = symbols;
}
/**
 * @return number of symbols in this text
 */
public int length() {
  return symbols.size();
}
@@ -37,7 +37,7 @@ import com.google.common.collect.Lists; | |||
*/ | |||
public class DuplicationsCollector implements Search.Collector { | |||
private final GeneralisedHashText text; | |||
private final TextSet text; | |||
private final String originResourceId; | |||
/** | |||
@@ -51,7 +51,7 @@ public class DuplicationsCollector implements Search.Collector { | |||
private ClonePart origin; | |||
private final List<ClonePart> parts = Lists.newArrayList(); | |||
public DuplicationsCollector(GeneralisedHashText text) { | |||
public DuplicationsCollector(TextSet text) { | |||
this.text = text; | |||
this.originResourceId = text.getBlock(0).getResourceId(); | |||
} | |||
@@ -162,9 +162,9 @@ public class DuplicationsCollector implements Search.Collector { | |||
// if (!first.getOriginPart().getResourceId().equals(second.getOriginPart().getResourceId())) { | |||
// return false; | |||
// } | |||
if (first.getCloneUnitLength() > second.getCloneUnitLength()) { | |||
return false; | |||
} | |||
// if (first.getCloneUnitLength() > second.getCloneUnitLength()) { | |||
// return false; | |||
// } | |||
List<ClonePart> firstParts = first.getCloneParts(); | |||
List<ClonePart> secondParts = second.getCloneParts(); | |||
return SortedListsUtils.contains(secondParts, firstParts, new ContainsInComparator(first.getCloneUnitLength(), second.getCloneUnitLength())) |
@@ -1,77 +0,0 @@ | |||
/* | |||
* Sonar, open source software quality management tool. | |||
* Copyright (C) 2008-2011 SonarSource | |||
* mailto:contact AT sonarsource DOT com | |||
* | |||
* Sonar is free software; you can redistribute it and/or | |||
* modify it under the terms of the GNU Lesser General Public | |||
* License as published by the Free Software Foundation; either | |||
* version 3 of the License, or (at your option) any later version. | |||
* | |||
* Sonar is distributed in the hope that it will be useful, | |||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
* Lesser General Public License for more details. | |||
* | |||
* You should have received a copy of the GNU Lesser General Public | |||
* License along with Sonar; if not, write to the Free Software | |||
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
*/ | |||
package org.sonar.duplications.detector.suffixtree; | |||
import java.util.List; | |||
import org.sonar.duplications.block.Block; | |||
import com.google.common.collect.Lists; | |||
public class GeneralisedHashText extends TextSet { | |||
public GeneralisedHashText(List<Block>... blocks) { | |||
super(blocks.length); | |||
for (int i = 0; i < blocks.length; i++) { | |||
addAll(blocks[i]); | |||
addTerminator(); | |||
} | |||
finish(); | |||
} | |||
private int count; | |||
private List<Integer> sizes = Lists.newArrayList(); | |||
public void addBlock(Block block) { | |||
symbols.add(block); | |||
} | |||
public void addAll(List<Block> list) { | |||
symbols.addAll(list); | |||
} | |||
public void addTerminator() { | |||
symbols.add(new Terminator(count)); | |||
sizes.add(symbols.size()); | |||
count++; | |||
} | |||
public void finish() { | |||
super.lens = new int[sizes.size()]; | |||
for (int i = 0; i < sizes.size(); i++) { | |||
super.lens[i] = sizes.get(i); | |||
} | |||
} | |||
@Override | |||
public Object symbolAt(int index) { | |||
Object obj = super.symbolAt(index); | |||
if (obj instanceof Block) { | |||
return ((Block) obj).getBlockHash(); | |||
} | |||
return obj; | |||
} | |||
public Block getBlock(int index) { | |||
return (Block) super.symbolAt(index); | |||
} | |||
} |
@@ -31,7 +31,7 @@ public final class Search { | |||
private final List<Node> innerNodes = new ArrayList<Node>(); | |||
public static void perform(TextSet text, Collector reporter) { | |||
new Search(SuffixTree.create(text), text.lens, reporter).compute(); | |||
new Search(SuffixTree.create(text), text.getLens(), reporter).compute(); | |||
} | |||
private Search(SuffixTree tree, int[] lens, Collector reporter) { |
@@ -25,9 +25,8 @@ import org.sonar.duplications.block.Block; | |||
import org.sonar.duplications.block.ByteArray; | |||
import org.sonar.duplications.index.CloneGroup; | |||
import org.sonar.duplications.index.CloneIndex; | |||
import com.google.common.collect.Lists; | |||
import com.google.common.collect.Maps; | |||
import com.google.common.collect.Sets; | |||
import com.google.common.collect.*; | |||
public final class SuffixTreeCloneDetectionAlgorithm { | |||
@@ -35,7 +34,7 @@ public final class SuffixTreeCloneDetectionAlgorithm { | |||
if (fileBlocks.isEmpty()) { | |||
return Collections.EMPTY_LIST; | |||
} | |||
GeneralisedHashText text = retrieveFromIndex(cloneIndex, fileBlocks); | |||
TextSet text = createTextSet(cloneIndex, fileBlocks); | |||
if (text == null) { | |||
return Collections.EMPTY_LIST; | |||
} | |||
@@ -47,43 +46,30 @@ public final class SuffixTreeCloneDetectionAlgorithm { | |||
private SuffixTreeCloneDetectionAlgorithm() { | |||
} | |||
private static GeneralisedHashText retrieveFromIndex(CloneIndex index, Collection<Block> fileBlocks) { | |||
String originResourceId = fileBlocks.iterator().next().getResourceId(); | |||
private static TextSet createTextSet(CloneIndex index, Collection<Block> fileBlocks) { | |||
Set<ByteArray> hashes = Sets.newHashSet(); | |||
for (Block fileBlock : fileBlocks) { | |||
hashes.add(fileBlock.getBlockHash()); | |||
} | |||
Map<String, List<Block>> collection = Maps.newHashMap(); | |||
for (ByteArray hash : hashes) { | |||
Collection<Block> blocks = index.getBySequenceHash(hash); | |||
for (Block blockFromIndex : blocks) { | |||
// Godin: skip blocks for this file if they come from index | |||
String resourceId = blockFromIndex.getResourceId(); | |||
if (!originResourceId.equals(resourceId)) { | |||
List<Block> list = collection.get(resourceId); | |||
if (list == null) { | |||
list = Lists.newArrayList(); | |||
collection.put(resourceId, list); | |||
} | |||
list.add(blockFromIndex); | |||
} | |||
} | |||
} | |||
String originResourceId = fileBlocks.iterator().next().getResourceId(); | |||
Map<String, List<Block>> fromIndex = retrieveFromIndex(index, originResourceId, hashes); | |||
if (collection.isEmpty() && hashes.size() == fileBlocks.size()) { // optimization for the case when there is no duplications | |||
if (fromIndex.isEmpty() && hashes.size() == fileBlocks.size()) { // optimization for the case when there is no duplications | |||
return null; | |||
} | |||
GeneralisedHashText text = new GeneralisedHashText(); | |||
return createTextSet(fileBlocks, fromIndex); | |||
} | |||
private static TextSet createTextSet(Collection<Block> fileBlocks, Map<String, List<Block>> fromIndex) { | |||
TextSet.Builder textSetBuilder = TextSet.builder(); | |||
// TODO Godin: maybe we can reduce size of tree and so memory consumption by removing non-repeatable blocks | |||
List<Block> sortedFileBlocks = Lists.newArrayList(fileBlocks); | |||
Collections.sort(sortedFileBlocks, BLOCK_COMPARATOR); | |||
text.addAll(sortedFileBlocks); | |||
text.addTerminator(); | |||
textSetBuilder.add(sortedFileBlocks); | |||
for (List<Block> list : collection.values()) { | |||
for (List<Block> list : fromIndex.values()) { | |||
Collections.sort(list, BLOCK_COMPARATOR); | |||
int i = 0; | |||
@@ -92,15 +78,32 @@ public final class SuffixTreeCloneDetectionAlgorithm { | |||
while ((j < list.size()) && (list.get(j).getIndexInFile() == list.get(j - 1).getIndexInFile() + 1)) { | |||
j++; | |||
} | |||
text.addAll(list.subList(i, j)); | |||
text.addTerminator(); | |||
textSetBuilder.add(list.subList(i, j)); | |||
i = j; | |||
} | |||
} | |||
text.finish(); | |||
return textSetBuilder.build(); | |||
} | |||
return text; | |||
/**
 * Queries the clone index for all blocks whose hash occurs in the file under
 * analysis, grouped by the resource they belong to.
 *
 * @param index clone index to query
 * @param originResourceId resource id of the file under analysis; blocks from this
 *        resource are skipped, since the index may already contain the file itself
 * @param hashes distinct block hashes of the file under analysis
 * @return blocks from other resources, grouped by resource id (lists unsorted)
 */
private static Map<String, List<Block>> retrieveFromIndex(CloneIndex index, String originResourceId, Set<ByteArray> hashes) {
  Map<String, List<Block>> collection = Maps.newHashMap();
  for (ByteArray hash : hashes) {
    Collection<Block> blocks = index.getBySequenceHash(hash);
    for (Block blockFromIndex : blocks) {
      // Godin: skip blocks for this file if they come from index
      String resourceId = blockFromIndex.getResourceId();
      if (!originResourceId.equals(resourceId)) {
        // lazily create the per-resource list on first hit
        List<Block> list = collection.get(resourceId);
        if (list == null) {
          list = Lists.newArrayList();
          collection.put(resourceId, list);
        }
        list.add(blockFromIndex);
      }
    }
  }
  return collection;
}
private static final Comparator<Block> BLOCK_COMPARATOR = new Comparator<Block>() { |
@@ -19,26 +19,67 @@ | |||
*/ | |||
package org.sonar.duplications.detector.suffixtree; | |||
import java.util.List; | |||
import org.sonar.duplications.block.Block; | |||
import com.google.common.collect.Lists; | |||
/** | |||
* Simplifies construction of <a href="http://en.wikipedia.org/wiki/Generalised_suffix_tree">generalised suffix-tree</a>. | |||
*/ | |||
public class TextSet extends AbstractText { | |||
int[] lens; | |||
/**
 * Builds a {@link TextSet} incrementally: each call to {@link #add(List)} appends one
 * sequence of blocks followed by a {@link Terminator} unique to that sequence, as
 * required for a generalised suffix-tree.
 * <p>
 * NOTE(review): {@link #build()} hands the accumulated symbol list to the TextSet by
 * reference — confirm builders are single-use and not touched after build().
 */
public static class Builder {

  // blocks of all sequences added so far, interleaved with their terminators
  private List<Object> symbols = Lists.newArrayList();

  // end offset (exclusive) of each terminated sequence within symbols
  private List<Integer> sizes = Lists.newArrayList();

  // instances are obtained via TextSet.builder()
  private Builder() {
  }

  /**
   * Appends one sequence of blocks, terminated by a terminator unique to it.
   */
  public void add(List<Block> list) {
    symbols.addAll(list);
    // sizes.size() is the ordinal of this sequence, which makes each terminator unique
    symbols.add(new Terminator(sizes.size()));
    sizes.add(symbols.size());
  }

  /**
   * @return a {@link TextSet} over everything added so far
   */
  public TextSet build() {
    int[] lens = new int[sizes.size()];
    for (int i = 0; i < sizes.size(); i++) {
      lens[i] = sizes.get(i);
    }
    return new TextSet(symbols, lens);
  }
}
/**
 * @return a new, empty {@link Builder}
 */
public static Builder builder() {
  return new Builder();
}
public TextSet(int size) { | |||
super(100); // FIXME | |||
private int[] lens; | |||
lens = new int[size]; | |||
private TextSet(List<Object> symbols, int[] lens) { | |||
super(symbols); | |||
this.lens = lens; | |||
} | |||
public TextSet(Text... text) { | |||
this(text.length); | |||
for (int i = 0; i < text.length; i++) { | |||
symbols.addAll(text[i].sequence(0, text[i].length())); | |||
symbols.add(new Terminator(i)); | |||
lens[i] = symbols.size(); | |||
/**
 * @return end offset (exclusive) of each terminated sequence within this text
 *         <p>
 *         NOTE(review): returns the internal array by reference, so callers could
 *         mutate it — consider returning a copy if that matters.
 */
public int[] getLens() {
  return lens;
}
@Override | |||
public Object symbolAt(int index) { | |||
Object obj = super.symbolAt(index); | |||
if (obj instanceof Block) { | |||
return ((Block) obj).getBlockHash(); | |||
} | |||
return obj; | |||
} | |||
/**
 * Returns the underlying {@link Block} at the given position; fails with a
 * {@link ClassCastException} when the symbol there is a terminator.
 */
public Block getBlock(int index) {
  return (Block) super.symbolAt(index);
}
public static class Terminator { |
@@ -112,6 +112,52 @@ public class SuffixTreeCloneDetectionAlgorithmTest extends DetectorTestCase { | |||
newClonePart("x", 14, 2))); | |||
} | |||
/** | |||
* Given: | |||
* <pre> | |||
* a: 1 2 3 4 | |||
* b: 4 3 2 | |||
* c: 4 3 1 | |||
* </pre> | |||
* Expected: | |||
* <pre> | |||
* a-c (1) | |||
* a-b (2) | |||
* a-b-c (3) | |||
* a-b-c (4) | |||
 * </pre>
*/ | |||
@Test
public void myTest3() {
  // index already contains two other files, each sharing blocks with file "a"
  CloneIndex index = createIndex(
      newBlocks("b", "4 3 2"),
      newBlocks("c", "4 3 1")
  );
  List<Block> fileBlocks = newBlocks("a", "1 2 3 4");
  List<CloneGroup> result = detect(index, fileBlocks);
  print(result);
  // one clone group per shared block hash (see Javadoc above): 1, 2, 3, 4
  assertEquals(4, result.size());
  // block "1" occurs in "a" and "c" only
  assertThat(result, hasCloneGroup(1,
      newClonePart("a", 0, 1),
      newClonePart("c", 2, 1)));
  // block "2" occurs in "a" and "b" only
  assertThat(result, hasCloneGroup(1,
      newClonePart("a", 1, 1),
      newClonePart("b", 2, 1)));
  // block "3" occurs in all three files
  assertThat(result, hasCloneGroup(1,
      newClonePart("a", 2, 1),
      newClonePart("b", 1, 1),
      newClonePart("c", 1, 1)));
  // block "4" occurs in all three files
  assertThat(result, hasCloneGroup(1,
      newClonePart("a", 3, 1),
      newClonePart("b", 0, 1),
      newClonePart("c", 0, 1)));
}
@Override | |||
protected List<CloneGroup> detect(CloneIndex index, List<Block> fileBlocks) { | |||
return SuffixTreeCloneDetectionAlgorithm.detect(index, fileBlocks); |