aboutsummaryrefslogtreecommitdiffstats
path: root/sonar-duplications/src
diff options
context:
space:
mode:
authorEvgeny Mandrikov <mandrikov@gmail.com>2011-12-07 02:37:47 +0400
committerEvgeny Mandrikov <mandrikov@gmail.com>2011-12-07 03:31:04 +0400
commitf0d96c1053f6345e82351522c5bd8ce5633008bd (patch)
treeb984e5f755b82ee21073b7ab967d314b22a34fe4 /sonar-duplications/src
parent9ae42f76ea1c21b43e3ea6ec044fe45276be9875 (diff)
downloadsonarqube-f0d96c1053f6345e82351522c5bd8ce5633008bd.tar.gz
sonarqube-f0d96c1053f6345e82351522c5bd8ce5633008bd.zip
SONAR-3060 Add new CPD algorithm based on suffix tree
Diffstat (limited to 'sonar-duplications/src')
-rw-r--r--sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/AbstractText.java45
-rw-r--r--sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java200
-rw-r--r--sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Edge.java79
-rw-r--r--sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/GeneralisedHashText.java77
-rw-r--r--sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Node.java88
-rw-r--r--sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Search.java137
-rw-r--r--sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Suffix.java86
-rw-r--r--sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTree.java109
-rw-r--r--sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithm.java111
-rw-r--r--sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Text.java41
-rw-r--r--sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/TextSet.java73
-rw-r--r--sonar-duplications/src/test/java/org/sonar/duplications/detector/CloneGroupMatcher.java81
-rw-r--r--sonar-duplications/src/test/java/org/sonar/duplications/detector/DetectorTestCase.java439
-rw-r--r--sonar-duplications/src/test/java/org/sonar/duplications/detector/original/OriginalCloneDetectionAlgorithmTest.java486
-rw-r--r--sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/PrintCollector.java47
-rw-r--r--sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTree.java96
-rw-r--r--sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTreeTest.java106
-rw-r--r--sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringText.java37
-rw-r--r--sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithmTest.java120
19 files changed, 1977 insertions, 481 deletions
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/AbstractText.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/AbstractText.java
new file mode 100644
index 00000000000..5a3ffa0c14c
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/AbstractText.java
@@ -0,0 +1,45 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public abstract class AbstractText implements Text {
+
+ protected final List<Object> symbols;
+
+ public AbstractText(int size) {
+ this.symbols = new ArrayList<Object>(size);
+ }
+
+ public int length() {
+ return symbols.size();
+ }
+
+ public Object symbolAt(int index) {
+ return symbols.get(index);
+ }
+
+ public List<Object> sequence(int fromIndex, int toIndex) {
+ return symbols.subList(fromIndex, toIndex);
+ }
+
+}
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java
new file mode 100644
index 00000000000..745fb90a3b3
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java
@@ -0,0 +1,200 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+
+import org.sonar.duplications.block.Block;
+import org.sonar.duplications.index.CloneGroup;
+import org.sonar.duplications.index.ClonePart;
+import org.sonar.duplications.utils.FastStringComparator;
+import org.sonar.duplications.utils.SortedListsUtils;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Implementation of {@link Search.Collector}, which constructs {@link CloneGroup}s.
+ */
+public class DuplicationsCollector implements Search.Collector {
+
+ private final GeneralisedHashText text;
+ private final String originResourceId;
+
+ /**
+ * Note that LinkedList should provide better performance here, because of use of operation remove.
+ *
+ * @see #filter(CloneGroup)
+ */
+ private final List<CloneGroup> filtered = Lists.newLinkedList();
+
+ private int length;
+ private ClonePart origin;
+ private final List<ClonePart> parts = Lists.newArrayList();
+
+ public DuplicationsCollector(GeneralisedHashText text) {
+ this.text = text;
+ this.originResourceId = text.getBlock(0).getResourceId();
+ }
+
+ /**
+ * @return current result
+ */
+ public List<CloneGroup> getResult() {
+ return filtered;
+ }
+
+ public void part(int start, int end, int len) {
+ length = len;
+ Block firstBlock = text.getBlock(start);
+ Block lastBlock = text.getBlock(end - 1);
+
+ ClonePart part = new ClonePart(
+ firstBlock.getResourceId(),
+ firstBlock.getIndexInFile(),
+ firstBlock.getFirstLineNumber(),
+ lastBlock.getLastLineNumber());
+
+ // TODO Godin: maybe use FastStringComparator here ?
+ if (originResourceId.equals(part.getResourceId())) { // part from origin
+ if (origin == null) {
+ origin = part;
+ } else if (part.getUnitStart() < origin.getUnitStart()) {
+ origin = part;
+ }
+ }
+
+ parts.add(part);
+ }
+
+ public void endOfGroup() {
+ Collections.sort(parts, CLONEPART_COMPARATOR);
+ CloneGroup group = new CloneGroup(length, origin, parts);
+ filter(group);
+
+ parts.clear();
+ origin = null;
+ }
+
+ private void filter(CloneGroup current) {
+ Iterator<CloneGroup> i = filtered.iterator();
+ while (i.hasNext()) {
+ CloneGroup earlier = i.next();
+ // Note that following two conditions cannot be true together - proof by contradiction:
+ // let C be the current clone and A and B were found earlier
+ // then since relation is transitive - (A in C) and (C in B) => (A in B)
+ // so A should be filtered earlier
+ if (containsIn(current, earlier)) {
+ // current clone fully covered by clone, which was found earlier
+ return;
+ }
+ // TODO Godin: must prove that this is unused
+ // if (containsIn(earlier, current)) {
+ // // current clone fully covers clone, which was found earlier
+ // i.remove();
+ // }
+ }
+ filtered.add(current);
+ }
+
+ private static final Comparator<ClonePart> CLONEPART_COMPARATOR = new Comparator<ClonePart>() {
+ public int compare(ClonePart o1, ClonePart o2) {
+ int c = RESOURCE_ID_COMPARATOR.compare(o1, o2);
+ if (c == 0) {
+ return o1.getUnitStart() - o2.getUnitStart();
+ }
+ return c;
+ }
+ };
+
+ private static final Comparator<ClonePart> RESOURCE_ID_COMPARATOR = new Comparator<ClonePart>() {
+ public int compare(ClonePart o1, ClonePart o2) {
+ return FastStringComparator.INSTANCE.compare(o1.getResourceId(), o2.getResourceId());
+ }
+ };
+
+ /**
+ * Checks that second clone contains first one.
+ * <p>
+ * Clone A is contained in another clone B, if every part pA from A has part pB in B,
+ * which satisfy the conditions:
+ * <pre>
+ * (pA.resourceId == pB.resourceId) and (pB.unitStart <= pA.unitStart) and (pA.unitEnd <= pB.unitEnd)
+ * </pre>
+ * And all resourcesId from B exactly the same as all resourceId from A, which means that also every part pB from B has part pA in A,
+ * which satisfy the condition:
+ * <pre>
+ * pB.resourceId == pA.resourceId
+ * </pre>
+ * So this relation is:
+ * <ul>
+ * <li>reflexive - A in A</li>
+ * <li>transitive - (A in B) and (B in C) => (A in C)</li>
+ * <li>antisymmetric - (A in B) and (B in A) <=> (A = B)</li>
+ * </ul>
+ * </p>
+ * <p>
+ * This method uses the fact that all parts already sorted by resourceId and unitStart (see {@link #CLONEPART_COMPARATOR}),
+ * so running time - O(|A|+|B|).
+ * </p>
+ */
+ private static boolean containsIn(CloneGroup first, CloneGroup second) {
+ // TODO Godin: must prove that this is unused
+ // if (!first.getOriginPart().getResourceId().equals(second.getOriginPart().getResourceId())) {
+ // return false;
+ // }
+ if (first.getCloneUnitLength() > second.getCloneUnitLength()) {
+ return false;
+ }
+ List<ClonePart> firstParts = first.getCloneParts();
+ List<ClonePart> secondParts = second.getCloneParts();
+ return SortedListsUtils.contains(secondParts, firstParts, new ContainsInComparator(first.getCloneUnitLength(), second.getCloneUnitLength()))
+ && SortedListsUtils.contains(firstParts, secondParts, RESOURCE_ID_COMPARATOR);
+ }
+
+ private static class ContainsInComparator implements Comparator<ClonePart> {
+ private final int l1, l2;
+
+ public ContainsInComparator(int l1, int l2) {
+ this.l1 = l1;
+ this.l2 = l2;
+ }
+
+ public int compare(ClonePart o1, ClonePart o2) {
+ int c = RESOURCE_ID_COMPARATOR.compare(o1, o2);
+ if (c == 0) {
+ if (o2.getUnitStart() <= o1.getUnitStart()) {
+ if (o1.getUnitStart() + l1 <= o2.getUnitStart() + l2) {
+ return 0; // match found - stop search
+ } else {
+ return 1; // continue search
+ }
+ } else {
+ return -1; // o1 < o2 by unitStart - stop search
+ }
+ } else {
+ return c;
+ }
+ }
+ }
+
+}
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Edge.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Edge.java
new file mode 100644
index 00000000000..968fbd9e226
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Edge.java
@@ -0,0 +1,79 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+public final class Edge {
+
+ private int beginIndex; // can't be changed
+ private int endIndex;
+ private Node startNode;
+ private Node endNode; // can't be changed, could be used as edge id
+
+ // each time edge is created, a new end node is created
+ public Edge(int beginIndex, int endIndex, Node startNode) {
+ this.beginIndex = beginIndex;
+ this.endIndex = endIndex;
+ this.startNode = startNode;
+ this.endNode = new Node(startNode, null);
+ }
+
+ public Node splitEdge(Suffix suffix) {
+ remove();
+ Edge newEdge = new Edge(beginIndex, beginIndex + suffix.getSpan(), suffix.getOriginNode());
+ newEdge.insert();
+ newEdge.endNode.setSuffixNode(suffix.getOriginNode());
+ beginIndex += suffix.getSpan() + 1;
+ startNode = newEdge.getEndNode();
+ insert();
+ return newEdge.getEndNode();
+ }
+
+ public void insert() {
+ startNode.addEdge(beginIndex, this);
+ }
+
+ public void remove() {
+ startNode.removeEdge(beginIndex);
+ }
+
+ /**
+ * @return length of this edge in symbols
+ */
+ public int getSpan() {
+ return endIndex - beginIndex;
+ }
+
+ public int getBeginIndex() {
+ return beginIndex;
+ }
+
+ public int getEndIndex() {
+ return endIndex;
+ }
+
+ public Node getStartNode() {
+ return startNode;
+ }
+
+ public Node getEndNode() {
+ return endNode;
+ }
+
+}
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/GeneralisedHashText.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/GeneralisedHashText.java
new file mode 100644
index 00000000000..202cb85084c
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/GeneralisedHashText.java
@@ -0,0 +1,77 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+import java.util.List;
+
+import org.sonar.duplications.block.Block;
+
+import com.google.common.collect.Lists;
+
+public class GeneralisedHashText extends TextSet {
+
+ public GeneralisedHashText(List<Block>... blocks) {
+ super(blocks.length);
+
+ for (int i = 0; i < blocks.length; i++) {
+ addAll(blocks[i]);
+ addTerminator();
+ }
+ finish();
+ }
+
+ private int count;
+ private List<Integer> sizes = Lists.newArrayList();
+
+ public void addBlock(Block block) {
+ symbols.add(block);
+ }
+
+ public void addAll(List<Block> list) {
+ symbols.addAll(list);
+ }
+
+ public void addTerminator() {
+ symbols.add(new Terminator(count));
+ sizes.add(symbols.size());
+ count++;
+ }
+
+ public void finish() {
+ super.lens = new int[sizes.size()];
+ for (int i = 0; i < sizes.size(); i++) {
+ super.lens[i] = sizes.get(i);
+ }
+ }
+
+ @Override
+ public Object symbolAt(int index) {
+ Object obj = super.symbolAt(index);
+ if (obj instanceof Block) {
+ return ((Block) obj).getBlockHash();
+ }
+ return obj;
+ }
+
+ public Block getBlock(int index) {
+ return (Block) super.symbolAt(index);
+ }
+
+}
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Node.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Node.java
new file mode 100644
index 00000000000..8fa39ecc832
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Node.java
@@ -0,0 +1,88 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+import java.util.Map;
+import java.util.HashMap;
+import java.util.Collection;
+
+public final class Node {
+
+ private final SuffixTree suffixTree;
+ private final Map<Object, Edge> edges;
+
+ /**
+ * Node represents string s[i],s[i+1],...,s[j],
+ * suffix-link is a link to node, which represents string s[i+1],...,s[j].
+ */
+ private Node suffixNode;
+
+ /**
+ * Number of symbols from the root to this node.
+ * <p>
+ * Note that this is not equal to number of nodes from root to this node,
+ * because in a compact suffix-tree edge can span multiple symbols - see {@link Edge#getSpan()}.
+ * </p><p>
+ * Depth of {@link #suffixNode} is always equal to this depth minus one.
+ * </p>
+ */
+ int depth;
+
+ int startSize, endSize;
+
+ public Node(Node node, Node suffixNode) {
+ this(node.suffixTree, suffixNode);
+ }
+
+ public Node(SuffixTree suffixTree, Node suffixNode) {
+ this.suffixTree = suffixTree;
+ this.suffixNode = suffixNode;
+ edges = new HashMap<Object, Edge>();
+ }
+
+ public Object symbolAt(int index) {
+ return suffixTree.symbolAt(index);
+ }
+
+ public void addEdge(int charIndex, Edge edge) {
+ edges.put(symbolAt(charIndex), edge);
+ }
+
+ public void removeEdge(int charIndex) {
+ edges.remove(symbolAt(charIndex));
+ }
+
+ public Edge findEdge(Object ch) {
+ return edges.get(ch);
+ }
+
+ public Node getSuffixNode() {
+ return suffixNode;
+ }
+
+ public void setSuffixNode(Node suffixNode) {
+ this.suffixNode = suffixNode;
+ }
+
+ public Collection<Edge> getEdges() {
+ return edges.values();
+ }
+
+}
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Search.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Search.java
new file mode 100644
index 00000000000..f13410e90d4
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Search.java
@@ -0,0 +1,137 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+import java.util.*;
+
+public final class Search {
+
+ private final SuffixTree tree;
+ private final int[] lens;
+ private final Collector reporter;
+
+ private final List<Integer> list = new ArrayList<Integer>();
+ private final List<Node> innerNodes = new ArrayList<Node>();
+
+ public static void perform(TextSet text, Collector reporter) {
+ new Search(SuffixTree.create(text), text.lens, reporter).compute();
+ }
+
+ private Search(SuffixTree tree, int[] lens, Collector reporter) {
+ this.tree = tree;
+ this.lens = lens;
+ this.reporter = reporter;
+ }
+
+ private void compute() {
+ // O(N)
+ computeDepth();
+
+ // O(N * log(N))
+ Collections.sort(innerNodes, DEPTH_COMPARATOR);
+
+ // O(N), recursive
+ createListOfLeafs(tree.getRootNode());
+
+ // O(N)
+ visitInnerNodes();
+ }
+
+ private void computeDepth() {
+ Queue<Node> queue = new LinkedList<Node>();
+ queue.add(tree.getRootNode());
+ tree.getRootNode().depth = 0;
+ while (!queue.isEmpty()) {
+ Node node = queue.remove();
+ if (!node.getEdges().isEmpty()) {
+ if (node != tree.getRootNode()) { // inner node = not leaf and not root
+ innerNodes.add(node);
+ }
+ for (Edge edge : node.getEdges()) {
+ Node endNode = edge.getEndNode();
+ endNode.depth = node.depth + edge.getSpan() + 1;
+ queue.add(endNode);
+ }
+ }
+ }
+ }
+
+ private static final Comparator<Node> DEPTH_COMPARATOR = new Comparator<Node>() {
+ public int compare(Node o1, Node o2) {
+ return o2.depth - o1.depth;
+ }
+ };
+
+ private void createListOfLeafs(Node node) {
+ node.startSize = list.size();
+ if (node.getEdges().isEmpty()) { // leaf
+ list.add(node.depth);
+ } else {
+ for (Edge edge : node.getEdges()) {
+ createListOfLeafs(edge.getEndNode());
+ }
+ node.endSize = list.size();
+ }
+ }
+
+ /**
+ * Each inner-node represents prefix of some suffixes, thus substring of text.
+ */
+ private void visitInnerNodes() {
+ for (Node node : innerNodes) {
+ if (containsOrigin(node)) {
+ report(node);
+ }
+ }
+ }
+
+ private boolean containsOrigin(Node node) {
+ for (int i = node.startSize; i < node.endSize; i++) {
+ int start = tree.text.length() - list.get(i);
+ int end = start + node.depth;
+ if (end < lens[0]) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ private void report(Node node) {
+ for (int i = node.startSize; i < node.endSize; i++) {
+ int start = tree.text.length() - list.get(i);
+ int end = start + node.depth;
+ reporter.part(start, end, node.depth);
+ }
+ reporter.endOfGroup();
+ }
+
+ public interface Collector {
+
+ /**
+ * @param start start position in generalised text
+ * @param end end position in generalised text
+ */
+ void part(int start, int end, int len);
+
+ void endOfGroup();
+
+ }
+
+}
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Suffix.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Suffix.java
new file mode 100644
index 00000000000..2f0165d7d41
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Suffix.java
@@ -0,0 +1,86 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+public final class Suffix {
+
+ private Node originNode;
+ private int beginIndex;
+ private int endIndex;
+
+ public Suffix(Node originNode, int beginIndex, int endIndex) {
+ this.originNode = originNode;
+ this.beginIndex = beginIndex;
+ this.endIndex = endIndex;
+ }
+
+ public boolean isExplicit() {
+ return beginIndex > endIndex;
+ }
+
+ public boolean isImplicit() {
+ return !isExplicit();
+ }
+
+ public void canonize() {
+ if (isImplicit()) {
+ Edge edge = originNode.findEdge(originNode.symbolAt(beginIndex));
+
+ int edgeSpan = edge.getSpan();
+ while (edgeSpan <= getSpan()) {
+ beginIndex += edgeSpan + 1;
+ originNode = edge.getEndNode();
+ if (beginIndex <= endIndex) {
+ edge = edge.getEndNode().findEdge(originNode.symbolAt(beginIndex));
+ edgeSpan = edge.getSpan();
+ }
+ }
+ }
+ }
+
+ public int getSpan() {
+ return endIndex - beginIndex;
+ }
+
+ public Node getOriginNode() {
+ return originNode;
+ }
+
+ public int getBeginIndex() {
+ return beginIndex;
+ }
+
+ public void incBeginIndex() {
+ beginIndex++;
+ }
+
+ public void changeOriginNode() {
+ originNode = originNode.getSuffixNode();
+ }
+
+ public int getEndIndex() {
+ return endIndex;
+ }
+
+ public void incEndIndex() {
+ endIndex++;
+ }
+
+}
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTree.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTree.java
new file mode 100644
index 00000000000..b6e21c940c7
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTree.java
@@ -0,0 +1,109 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+import com.google.common.base.Objects;
+
+/**
+ * The implementation of the algorithm for constructing suffix-tree based on
+ * <a href="http://illya-keeplearning.blogspot.com/search/label/suffix%20tree">Java-port</a> of
+ * <a href="http://marknelson.us/1996/08/01/suffix-trees/">Mark Nelson's C++ implementation of Ukkonen's algorithm</a>.
+ */
+public final class SuffixTree {
+
+ final Text text;
+
+ private final Node root;
+
+ public static SuffixTree create(Text text) {
+ SuffixTree tree = new SuffixTree(text);
+ Suffix active = new Suffix(tree.root, 0, -1);
+ for (int i = 0; i < text.length(); i++) {
+ tree.addPrefix(active, i);
+ }
+ return tree;
+ }
+
+ private SuffixTree(Text text) {
+ this.text = text;
+ root = new Node(this, null);
+ }
+
+ private void addPrefix(Suffix active, int endIndex) {
+ Node lastParentNode = null;
+ Node parentNode;
+
+ while (true) {
+ Edge edge;
+ parentNode = active.getOriginNode();
+
+ // Step 1 is to try and find a matching edge for the given node.
+ // If a matching edge exists, we are done adding edges, so we break out of this big loop.
+ if (active.isExplicit()) {
+ edge = active.getOriginNode().findEdge(symbolAt(endIndex));
+ if (edge != null) {
+ break;
+ }
+ } else {
+ // implicit node, a little more complicated
+ edge = active.getOriginNode().findEdge(symbolAt(active.getBeginIndex()));
+ int span = active.getSpan();
+ if (Objects.equal(symbolAt(edge.getBeginIndex() + span + 1), symbolAt(endIndex))) {
+ break;
+ }
+ parentNode = edge.splitEdge(active);
+ }
+
+ // We didn't find a matching edge, so we create a new one, add it to the tree at the parent node position,
+ // and insert it into the hash table. When we create a new node, it also means we need to create
+ // a suffix link to the new node from the last node we visited.
+ Edge newEdge = new Edge(endIndex, text.length() - 1, parentNode);
+ newEdge.insert();
+ updateSuffixNode(lastParentNode, parentNode);
+ lastParentNode = parentNode;
+
+ // This final step is where we move to the next smaller suffix
+ if (active.getOriginNode() == root) {
+ active.incBeginIndex();
+ } else {
+ active.changeOriginNode();
+ }
+ active.canonize();
+ }
+ updateSuffixNode(lastParentNode, parentNode);
+ active.incEndIndex(); // Now the endpoint is the next active point
+ active.canonize();
+ }
+
+ private void updateSuffixNode(Node node, Node suffixNode) {
+ if ((node != null) && (node != root)) {
+ node.setSuffixNode(suffixNode);
+ }
+ }
+
+ public Object symbolAt(int index) {
+ return text.symbolAt(index);
+ }
+
+ public Node getRootNode() {
+ return root;
+ }
+
+}
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithm.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithm.java
new file mode 100644
index 00000000000..0bd13a88cbf
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithm.java
@@ -0,0 +1,111 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+import java.util.*;
+
+import org.sonar.duplications.block.Block;
+import org.sonar.duplications.block.ByteArray;
+import org.sonar.duplications.index.CloneGroup;
+import org.sonar.duplications.index.CloneIndex;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+
+public final class SuffixTreeCloneDetectionAlgorithm {
+
+ public static List<CloneGroup> detect(CloneIndex cloneIndex, Collection<Block> fileBlocks) {
+ if (fileBlocks.isEmpty()) {
+ return Collections.EMPTY_LIST;
+ }
+ GeneralisedHashText text = retrieveFromIndex(cloneIndex, fileBlocks);
+ if (text == null) {
+ return Collections.EMPTY_LIST;
+ }
+ DuplicationsCollector reporter = new DuplicationsCollector(text);
+ Search.perform(text, reporter);
+ return reporter.getResult();
+ }
+
+ private SuffixTreeCloneDetectionAlgorithm() {
+ }
+
+ private static GeneralisedHashText retrieveFromIndex(CloneIndex index, Collection<Block> fileBlocks) {
+ String originResourceId = fileBlocks.iterator().next().getResourceId();
+
+ Set<ByteArray> hashes = Sets.newHashSet();
+ for (Block fileBlock : fileBlocks) {
+ hashes.add(fileBlock.getBlockHash());
+ }
+
+ Map<String, List<Block>> collection = Maps.newHashMap();
+ for (ByteArray hash : hashes) {
+ Collection<Block> blocks = index.getBySequenceHash(hash);
+ for (Block blockFromIndex : blocks) {
+ // Godin: skip blocks for this file if they come from index
+ String resourceId = blockFromIndex.getResourceId();
+ if (!originResourceId.equals(resourceId)) {
+ List<Block> list = collection.get(resourceId);
+ if (list == null) {
+ list = Lists.newArrayList();
+ collection.put(resourceId, list);
+ }
+ list.add(blockFromIndex);
+ }
+ }
+ }
+
+ if (collection.isEmpty() && hashes.size() == fileBlocks.size()) { // optimization for the case when there is no duplications
+ return null;
+ }
+
+ GeneralisedHashText text = new GeneralisedHashText();
+ List<Block> sortedFileBlocks = Lists.newArrayList(fileBlocks);
+ Collections.sort(sortedFileBlocks, BLOCK_COMPARATOR);
+ text.addAll(sortedFileBlocks);
+ text.addTerminator();
+
+ for (List<Block> list : collection.values()) {
+ Collections.sort(list, BLOCK_COMPARATOR);
+
+ int i = 0;
+ while (i < list.size()) {
+ int j = i + 1;
+ while ((j < list.size()) && (list.get(j).getIndexInFile() == list.get(j - 1).getIndexInFile() + 1)) {
+ j++;
+ }
+ text.addAll(list.subList(i, j));
+ text.addTerminator();
+ i = j;
+ }
+ }
+
+ text.finish();
+
+ return text;
+ }
+
+ private static final Comparator<Block> BLOCK_COMPARATOR = new Comparator<Block>() {
+ public int compare(Block o1, Block o2) {
+ return o1.getIndexInFile() - o2.getIndexInFile();
+ }
+ };
+
+}
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Text.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Text.java
new file mode 100644
index 00000000000..19b644d494a
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Text.java
@@ -0,0 +1,41 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+import java.util.List;
+
+/**
+ * Represents text.
+ */
+public interface Text {
+
+ /**
+ * @return length of the sequence of symbols represented by this object
+ */
+ int length();
+
+ /**
+ * @return symbol at the specified index
+ */
+ Object symbolAt(int index);
+
+ List<Object> sequence(int fromIndex, int toIndex);
+
+}
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/TextSet.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/TextSet.java
new file mode 100644
index 00000000000..aa3628ed048
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/TextSet.java
@@ -0,0 +1,73 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+/**
+ * Simplifies construction of <a href="http://en.wikipedia.org/wiki/Generalised_suffix_tree">generalised suffix-tree</a>.
+ */
+public class TextSet extends AbstractText {
+
+ int[] lens;
+
+ public TextSet(int size) {
+ super(100); // FIXME
+
+ lens = new int[size];
+ }
+
+ public TextSet(Text... text) {
+ this(text.length);
+ for (int i = 0; i < text.length; i++) {
+ symbols.addAll(text[i].sequence(0, text[i].length()));
+ symbols.add(new Terminator(i));
+ lens[i] = symbols.size();
+ }
+ }
+
+ public static class Terminator {
+
+ private final int stringNumber;
+
+ public Terminator(int i) {
+ this.stringNumber = i;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ return (obj instanceof Terminator) && (((Terminator) obj).stringNumber == stringNumber);
+ }
+
+ @Override
+ public int hashCode() {
+ return stringNumber;
+ }
+
+ public int getStringNumber() {
+ return stringNumber;
+ }
+
+ @Override
+ public String toString() {
+ return "$" + stringNumber;
+ }
+
+ }
+
+}
diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/CloneGroupMatcher.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/CloneGroupMatcher.java
new file mode 100644
index 00000000000..ee4d105e490
--- /dev/null
+++ b/sonar-duplications/src/test/java/org/sonar/duplications/detector/CloneGroupMatcher.java
@@ -0,0 +1,81 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector;
+
+import org.hamcrest.Description;
+import org.hamcrest.Matcher;
+import org.hamcrest.Matchers;
+import org.hamcrest.TypeSafeMatcher;
+import org.sonar.duplications.index.CloneGroup;
+import org.sonar.duplications.index.ClonePart;
+
+public class CloneGroupMatcher extends TypeSafeMatcher<CloneGroup> {
+
+ public static Matcher<Iterable<CloneGroup>> hasCloneGroup(int expectedLen, ClonePart... expectedParts) {
+ return Matchers.hasItem(new CloneGroupMatcher(expectedLen, expectedParts));
+ }
+
+ private final int expectedLen;
+ private final ClonePart[] expectedParts;
+
+ private CloneGroupMatcher(int expectedLen, ClonePart... expectedParts) {
+ this.expectedLen = expectedLen;
+ this.expectedParts = expectedParts;
+ }
+
+ public boolean matchesSafely(CloneGroup cloneGroup) {
+ // Check length
+ if (expectedLen != cloneGroup.getCloneUnitLength()) {
+ return false;
+ }
+ // Check number of parts
+ if (expectedParts.length != cloneGroup.getCloneParts().size()) {
+ return false;
+ }
+ // Check origin
+ if (!expectedParts[0].equals(cloneGroup.getOriginPart())) {
+ return false;
+ }
+ // Check parts
+ for (ClonePart expectedPart : expectedParts) {
+ boolean matched = false;
+ for (ClonePart part : cloneGroup.getCloneParts()) {
+ if (part.equals(expectedPart)) {
+ matched = true;
+ break;
+ }
+ }
+ if (!matched) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ public void describeTo(Description description) {
+ StringBuilder builder = new StringBuilder();
+ for (ClonePart part : expectedParts) {
+ builder.append(part).append(" - ");
+ }
+ builder.append(expectedLen);
+ description.appendText(builder.toString());
+ }
+
+}
diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/DetectorTestCase.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/DetectorTestCase.java
new file mode 100644
index 00000000000..3a461db1c5e
--- /dev/null
+++ b/sonar-duplications/src/test/java/org/sonar/duplications/detector/DetectorTestCase.java
@@ -0,0 +1,439 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector;
+
+import static org.hamcrest.Matchers.hasItem;
+import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.sameInstance;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertThat;
+import static org.mockito.Mockito.spy;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.verifyNoMoreInteractions;
+import static org.sonar.duplications.detector.CloneGroupMatcher.hasCloneGroup;
+
+import java.util.*;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.sonar.duplications.block.Block;
+import org.sonar.duplications.block.ByteArray;
+import org.sonar.duplications.index.CloneGroup;
+import org.sonar.duplications.index.CloneIndex;
+import org.sonar.duplications.index.ClonePart;
+import org.sonar.duplications.index.MemoryCloneIndex;
+import org.sonar.duplications.junit.TestNamePrinter;
+
+import com.google.common.collect.Lists;
+
+public abstract class DetectorTestCase {
+
+ @Rule
+ public TestNamePrinter testNamePrinter = new TestNamePrinter();
+
+ protected static int LINES_PER_BLOCK = 5;
+
+ /**
+ * To simplify testing we assume that each block starts from a new line and contains {@link #LINES_PER_BLOCK} lines,
+ * so we can simply use index and hash.
+ */
+ protected static Block newBlock(String resourceId, ByteArray hash, int index) {
+ return new Block(resourceId, hash, index, index, index + LINES_PER_BLOCK);
+ }
+
+ protected static ClonePart newClonePart(String resourceId, int unitStart, int cloneUnitLength) {
+ return new ClonePart(resourceId, unitStart, unitStart, unitStart + cloneUnitLength + LINES_PER_BLOCK - 1);
+ }
+
+ protected abstract List<CloneGroup> detect(CloneIndex index, List<Block> fileBlocks);
+
+ /**
+ * Given:
+ * <pre>
+ * y: 2 3 4 5
+ * z: 3 4
+ * x: 1 2 3 4 5 6
+ * </pre>
+ * Expected:
+ * <pre>
+ * x-y (2 3 4 5)
+ * x-y-z (3 4)
+ * </pre>
+ */
+ @Test
+ public void exampleFromPaper() {
+ CloneIndex index = createIndex(
+ newBlocks("y", "2 3 4 5"),
+ newBlocks("z", "3 4"));
+ List<Block> fileBlocks = newBlocks("x", "1 2 3 4 5 6");
+ List<CloneGroup> result = detect(index, fileBlocks);
+
+ print(result);
+ assertEquals(2, result.size());
+
+ assertThat(result, hasCloneGroup(4,
+ newClonePart("x", 1, 4),
+ newClonePart("y", 0, 4)));
+
+ assertThat(result, hasCloneGroup(2,
+ newClonePart("x", 2, 2),
+ newClonePart("y", 1, 2),
+ newClonePart("z", 0, 2)));
+ }
+
+ /**
+ * Given:
+ * <pre>
+ * a: 2 3 4 5
+ * b: 3 4
+ * c: 1 2 3 4 5 6
+ * </pre>
+ * Expected:
+ * <pre>
+ * c-a (2 3 4 5)
+ * c-a-b (3 4)
+ * </pre>
+ */
+ @Test
+ public void exampleFromPaperWithModifiedResourceIds() {
+ CloneIndex cloneIndex = createIndex(
+ newBlocks("a", "2 3 4 5"),
+ newBlocks("b", "3 4"));
+ List<Block> fileBlocks = newBlocks("c", "1 2 3 4 5 6");
+ List<CloneGroup> clones = detect(cloneIndex, fileBlocks);
+
+ print(clones);
+ assertThat(clones.size(), is(2));
+
+ assertThat(clones, hasCloneGroup(4,
+ newClonePart("c", 1, 4),
+ newClonePart("a", 0, 4)));
+
+ assertThat(clones, hasCloneGroup(2,
+ newClonePart("c", 2, 2),
+ newClonePart("a", 1, 2),
+ newClonePart("b", 0, 2)));
+ }
+
+ /**
+ * Given:
+ * <pre>
+ * b: 3 4 5 6
+ * c: 5 6 7
+ * a: 1 2 3 4 5 6 7 8 9
+ * </pre>
+ * Expected:
+ * <pre>
+ * a-b (3 4 5 6)
+ * a-b-c (5 6)
+ * a-c (5 6 7)
+ * </pre>
+ */
+ @Test
+ public void example1() {
+ CloneIndex index = createIndex(
+ newBlocks("b", "3 4 5 6"),
+ newBlocks("c", "5 6 7"));
+ List<Block> fileBlocks = newBlocks("a", "1 2 3 4 5 6 7 8 9");
+ List<CloneGroup> result = detect(index, fileBlocks);
+
+ print(result);
+ assertThat(result.size(), is(3));
+
+ assertThat(result, hasCloneGroup(4,
+ newClonePart("a", 2, 4),
+ newClonePart("b", 0, 4)));
+
+ assertThat(result, hasCloneGroup(3,
+ newClonePart("a", 4, 3),
+ newClonePart("c", 0, 3)));
+
+ assertThat(result, hasCloneGroup(2,
+ newClonePart("a", 4, 2),
+ newClonePart("b", 2, 2),
+ newClonePart("c", 0, 2)));
+ }
+
+ /**
+ * Given:
+ * <pre>
+ * b: 1 2 3 4 1 2 3 4 1 2 3 4
+ * c: 1 2 3 4
+ * a: 1 2 3 5
+ * </pre>
+ * Expected:
+ * <pre>
+ * a-b-b-b-c (1 2 3)
+ * </pre>
+ */
+ @Test
+ public void example2() {
+ CloneIndex index = createIndex(
+ newBlocks("b", "1 2 3 4 1 2 3 4 1 2 3 4"),
+ newBlocks("c", "1 2 3 4"));
+ List<Block> fileBlocks = newBlocks("a", "1 2 3 5");
+ List<CloneGroup> result = detect(index, fileBlocks);
+
+ print(result);
+ assertThat(result.size(), is(1));
+
+ assertThat(result, hasCloneGroup(3,
+ newClonePart("a", 0, 3),
+ newClonePart("b", 0, 3),
+ newClonePart("b", 4, 3),
+ newClonePart("b", 8, 3),
+ newClonePart("c", 0, 3)));
+ }
+
+ /**
+ * Test for problem, which was described in original paper - same clone would be reported twice.
+ * Given:
+ * <pre>
+ * a: 1 2 3 1 2 4
+ * </pre>
+ * Expected only one clone:
+ * <pre>
+ * a-a (1 2)
+ * </pre>
+ */
+ @Test
+ public void clonesInFileItself() {
+ CloneIndex index = createIndex();
+ List<Block> fileBlocks = newBlocks("a", "1 2 3 1 2 4");
+ List<CloneGroup> result = detect(index, fileBlocks);
+
+ print(result);
+ assertThat(result.size(), is(1));
+
+ assertThat(result, hasCloneGroup(2,
+ newClonePart("a", 0, 2),
+ newClonePart("a", 3, 2)));
+ }
+
+ /**
+ * Given:
+ * <pre>
+ * b: 1 2 1 2
+ * a: 1 2 1
+ * </pre>
+ * Expected:
+ * <pre>
+ * a-b-b (1 2)
+ * a-b (1 2 1)
+ * </pre>
+ * "a-a-b-b (1)" should not be reported, because fully covered by "a-b (1 2 1)"
+ */
+ @Test
+ public void covered() {
+ CloneIndex index = createIndex(
+ newBlocks("b", "1 2 1 2"));
+ List<Block> fileBlocks = newBlocks("a", "1 2 1");
+ List<CloneGroup> result = detect(index, fileBlocks);
+
+ print(result);
+ assertThat(result.size(), is(2));
+
+ assertThat(result, hasCloneGroup(3,
+ newClonePart("a", 0, 3),
+ newClonePart("b", 0, 3)));
+
+ assertThat(result, hasCloneGroup(2,
+ newClonePart("a", 0, 2),
+ newClonePart("b", 0, 2),
+ newClonePart("b", 2, 2)));
+ }
+
+ /**
+ * Given:
+ * <pre>
+ * b: 1 2 1 2 1 2 1
+ * a: 1 2 1 2 1 2
+ * </pre>
+ * Expected:
+ * <pre>
+ * a-b-b (1 2 1 2 1) - note that there is overlapping among parts for "b"
+ * a-b (1 2 1 2 1 2)
+ * </pre>
+ */
+ @Test
+ public void problemWithNestedCloneGroups() {
+ CloneIndex index = createIndex(
+ newBlocks("b", "1 2 1 2 1 2 1"));
+ List<Block> fileBlocks = newBlocks("a", "1 2 1 2 1 2");
+ List<CloneGroup> result = detect(index, fileBlocks);
+
+ print(result);
+ assertThat(result.size(), is(2));
+
+ assertThat(result, hasCloneGroup(6,
+ newClonePart("a", 0, 6),
+ newClonePart("b", 0, 6)));
+
+ assertThat(result, hasCloneGroup(5,
+ newClonePart("a", 0, 5),
+ newClonePart("b", 0, 5),
+ newClonePart("b", 2, 5)));
+ }
+
+ /**
+ * Given:
+ * <pre>
+ * a: 1 2 3
+ * b: 1 2 4
+ * a: 1 2 5
+ * </pre>
+ * Expected:
+ * <pre>
+ * a-b (1 2) - instead of "a-a-b", which will be the case if file from index not ignored
+ * </pre>
+ */
+ @Test
+ public void fileAlreadyInIndex() {
+ CloneIndex index = createIndex(
+ newBlocks("a", "1 2 3"),
+ newBlocks("b", "1 2 4"));
+ // Note about blocks with hashes "3", "4" and "5": those blocks here in order to not face another problem - with EOF (see separate test)
+ List<Block> fileBlocks = newBlocks("a", "1 2 5");
+ List<CloneGroup> result = detect(index, fileBlocks);
+
+ print(result);
+ assertThat(result.size(), is(1));
+
+ assertThat(result, hasCloneGroup(2,
+ newClonePart("a", 0, 2),
+ newClonePart("b", 0, 2)));
+ }
+
+ /**
+ * Given: file with repeated hashes
+ * Expected: only one query of index for each unique hash
+ */
+ @Test
+ public void only_one_query_of_index_for_each_unique_hash() {
+ CloneIndex index = spy(createIndex());
+ List<Block> fileBlocks = newBlocks("a", "1 2 1 2");
+ detect(index, fileBlocks);
+
+ verify(index).getBySequenceHash(new ByteArray("01"));
+ verify(index).getBySequenceHash(new ByteArray("02"));
+ verifyNoMoreInteractions(index);
+ }
+
+ /**
+ * Given: empty list of blocks for file
+ * Expected: {@link Collections#EMPTY_LIST}
+ */
+ @Test
+ public void shouldReturnEmptyListWhenNoBlocksForFile() {
+ List<CloneGroup> result = detect(null, new ArrayList<Block>());
+ assertThat(result, sameInstance(Collections.EMPTY_LIST));
+ }
+
+ /**
+ * Given:
+ * <pre>
+ * b: 1 2 3 4
+ * a: 1 2 3
+ * </pre>
+ * Expected clone which ends at the end of file "a":
+ * <pre>
+ * a-b (1 2 3)
+ * </pre>
+ */
+ @Test
+ public void problemWithEndOfFile() {
+ CloneIndex cloneIndex = createIndex(
+ newBlocks("b", "1 2 3 4"));
+ List<Block> fileBlocks =
+ newBlocks("a", "1 2 3");
+ List<CloneGroup> clones = detect(cloneIndex, fileBlocks);
+
+ print(clones);
+ assertThat(clones.size(), is(1));
+
+ assertThat(clones, hasCloneGroup(3,
+ newClonePart("a", 0, 3),
+ newClonePart("b", 0, 3)));
+ }
+
+ /**
+ * Given file with two lines, containing following statements:
+ * <pre>
+ * 0: A,B,A,B
+ * 1: A,B,A
+ * </pre>
+ * with block size 5 each block will span both lines, and hashes will be:
+ * <pre>
+ * A,B,A,B,A=1
+ * B,A,B,A,B=2
+ * A,B,A,B,A=1
+ * </pre>
+ * Expected: one clone with two parts, which contain exactly the same lines
+ */
+ @Test
+ public void same_lines_but_different_indexes() {
+ CloneIndex cloneIndex = createIndex();
+ List<Block> fileBlocks = Arrays.asList(
+ new Block("a", new ByteArray("1".getBytes()), 0, 0, 1),
+ new Block("a", new ByteArray("2".getBytes()), 1, 0, 1),
+ new Block("a", new ByteArray("1".getBytes()), 2, 0, 1));
+ List<CloneGroup> clones = detect(cloneIndex, fileBlocks);
+
+ print(clones);
+ assertThat(clones.size(), is(1));
+ Iterator<CloneGroup> clonesIterator = clones.iterator();
+
+ CloneGroup clone = clonesIterator.next();
+ assertThat(clone.getCloneUnitLength(), is(1));
+ assertThat(clone.getCloneParts().size(), is(2));
+ assertThat(clone.getOriginPart(), is(new ClonePart("a", 0, 0, 1)));
+ assertThat(clone.getCloneParts(), hasItem(new ClonePart("a", 0, 0, 1)));
+ assertThat(clone.getCloneParts(), hasItem(new ClonePart("a", 2, 0, 1)));
+ }
+
+ protected static void print(List<CloneGroup> clones) {
+ for (CloneGroup clone : clones) {
+ System.out.println(clone);
+ }
+ System.out.println();
+ }
+
+ protected static List<Block> newBlocks(String resourceId, String hashes) {
+ List<Block> result = Lists.newArrayList();
+ int indexInFile = 0;
+ for (int i = 0; i < hashes.length(); i += 2) {
+ Block block = newBlock(resourceId, new ByteArray("0" + hashes.charAt(i)), indexInFile);
+ result.add(block);
+ indexInFile++;
+ }
+ return result;
+ }
+
+ protected static CloneIndex createIndex(List<Block>... blocks) {
+ CloneIndex cloneIndex = new MemoryCloneIndex();
+ for (List<Block> b : blocks) {
+ for (Block block : b) {
+ cloneIndex.insert(block);
+ }
+ }
+ return cloneIndex;
+ }
+
+}
diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/original/OriginalCloneDetectionAlgorithmTest.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/original/OriginalCloneDetectionAlgorithmTest.java
index e22faf753a9..54d3f87ab18 100644
--- a/sonar-duplications/src/test/java/org/sonar/duplications/detector/original/OriginalCloneDetectionAlgorithmTest.java
+++ b/sonar-duplications/src/test/java/org/sonar/duplications/detector/original/OriginalCloneDetectionAlgorithmTest.java
@@ -19,494 +19,18 @@
*/
package org.sonar.duplications.detector.original;
-import static org.hamcrest.Matchers.hasItem;
-import static org.hamcrest.Matchers.is;
-import static org.hamcrest.Matchers.sameInstance;
-import static org.junit.Assert.assertThat;
-import static org.mockito.Mockito.spy;
-import static org.mockito.Mockito.verify;
-import static org.mockito.Mockito.verifyNoMoreInteractions;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Iterator;
import java.util.List;
-import org.junit.Rule;
-import org.junit.Test;
import org.sonar.duplications.block.Block;
-import org.sonar.duplications.block.ByteArray;
+import org.sonar.duplications.detector.DetectorTestCase;
import org.sonar.duplications.index.CloneGroup;
import org.sonar.duplications.index.CloneIndex;
-import org.sonar.duplications.index.ClonePart;
-import org.sonar.duplications.index.MemoryCloneIndex;
-import org.sonar.duplications.junit.TestNamePrinter;
-
-import com.google.common.collect.Lists;
-
-public class OriginalCloneDetectionAlgorithmTest {
-
- @Rule
- public TestNamePrinter name = new TestNamePrinter();
-
- private static int LINES_PER_BLOCK = 5;
-
- /**
- * To simplify testing we assume that each block starts from a new line and contains {@link #LINES_PER_BLOCK} lines,
- * so we can simply use index and hash.
- */
- private static Block newBlock(String resourceId, ByteArray hash, int index) {
- return new Block(resourceId, hash, index, index, index + LINES_PER_BLOCK);
- }
-
- private static ClonePart newClonePart(String resourceId, int unitStart, int cloneUnitLength) {
- return new ClonePart(resourceId, unitStart, unitStart, unitStart + cloneUnitLength + LINES_PER_BLOCK - 1);
- }
-
- /**
- * Given:
- * <pre>
- * y: 2 3 4 5
- * z: 3 4
- * x: 1 2 3 4 5 6
- * </pre>
- * Expected:
- * <pre>
- * x-y (2 3 4 5)
- * x-y-z (3 4)
- * </pre>
- */
- @Test
- public void exampleFromPaper() {
- CloneIndex cloneIndex = createIndex(
- blocksForResource("y").withHashes("2", "3", "4", "5"),
- blocksForResource("z").withHashes("3", "4"));
- List<Block> fileBlocks = blocksForResource("x").withHashes("1", "2", "3", "4", "5", "6");
- List<CloneGroup> clones = OriginalCloneDetectionAlgorithm.detect(cloneIndex, fileBlocks);
- print(clones);
- assertThat(clones.size(), is(2));
- Iterator<CloneGroup> clonesIterator = clones.iterator();
-
- CloneGroup clone = clonesIterator.next();
- assertThat(clone.getCloneUnitLength(), is(4));
- assertThat(clone.getCloneParts().size(), is(2));
- assertThat(clone.getOriginPart(), is(newClonePart("x", 1, 4)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("x", 1, 4)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("y", 0, 4)));
-
- clone = clonesIterator.next();
- assertThat(clone.getCloneUnitLength(), is(2));
- assertThat(clone.getCloneParts().size(), is(3));
- assertThat(clone.getOriginPart(), is(newClonePart("x", 2, 2)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("x", 2, 2)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("y", 1, 2)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("z", 0, 2)));
- }
-
- /**
- * Given:
- * <pre>
- * a: 2 3 4 5
- * b: 3 4
- * c: 1 2 3 4 5 6
- * </pre>
- * Expected:
- * <pre>
- * c-a (2 3 4 5)
- * c-a-b (3 4)
- * </pre>
- */
- @Test
- public void exampleFromPaperWithModifiedResourceIds() {
- CloneIndex cloneIndex = createIndex(
- blocksForResource("a").withHashes("2", "3", "4", "5"),
- blocksForResource("b").withHashes("3", "4"));
- List<Block> fileBlocks = blocksForResource("c").withHashes("1", "2", "3", "4", "5", "6");
- List<CloneGroup> clones = OriginalCloneDetectionAlgorithm.detect(cloneIndex, fileBlocks);
- print(clones);
- assertThat(clones.size(), is(2));
- Iterator<CloneGroup> clonesIterator = clones.iterator();
-
- CloneGroup clone = clonesIterator.next();
- assertThat(clone.getCloneUnitLength(), is(4));
- assertThat(clone.getCloneParts().size(), is(2));
- assertThat(clone.getOriginPart(), is(newClonePart("c", 1, 4)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("c", 1, 4)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 0, 4)));
-
- clone = clonesIterator.next();
- assertThat(clone.getCloneUnitLength(), is(2));
- assertThat(clone.getCloneParts().size(), is(3));
- assertThat(clone.getOriginPart(), is(newClonePart("c", 2, 2)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("c", 2, 2)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 1, 2)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 0, 2)));
- }
-
- /**
- * Given:
- * <pre>
- * b: 3 4 5 6
- * c: 5 6 7
- * a: 1 2 3 4 5 6 7 8 9
- * </pre>
- * Expected:
- * <pre>
- * a-b (3 4 5 6)
- * a-b-c (5 6)
- * a-c (5 6 7)
- * </pre>
- */
- @Test
- public void example1() {
- CloneIndex cloneIndex = createIndex(
- blocksForResource("b").withHashes("3", "4", "5", "6"),
- blocksForResource("c").withHashes("5", "6", "7"));
- List<Block> fileBlocks =
- blocksForResource("a").withHashes("1", "2", "3", "4", "5", "6", "7", "8", "9");
- List<CloneGroup> clones = OriginalCloneDetectionAlgorithm.detect(cloneIndex, fileBlocks);
- print(clones);
- assertThat(clones.size(), is(3));
- Iterator<CloneGroup> clonesIterator = clones.iterator();
-
- CloneGroup clone = clonesIterator.next();
- assertThat(clone.getCloneUnitLength(), is(4));
- assertThat(clone.getCloneParts().size(), is(2));
- assertThat(clone.getOriginPart(), is(newClonePart("a", 2, 4)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 2, 4)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 0, 4)));
-
- clone = clonesIterator.next();
- assertThat(clone.getCloneUnitLength(), is(2));
- assertThat(clone.getCloneParts().size(), is(3));
- assertThat(clone.getOriginPart(), is(newClonePart("a", 4, 2)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 4, 2)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 2, 2)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("c", 0, 2)));
-
- clone = clonesIterator.next();
- assertThat(clone.getCloneUnitLength(), is(3));
- assertThat(clone.getCloneParts().size(), is(2));
- assertThat(clone.getOriginPart(), is(newClonePart("a", 4, 3)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 4, 3)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("c", 0, 3)));
- }
-
- /**
- * Given:
- * <pre>
- * b: 1 2 3 4 1 2 3 4 1 2 3 4
- * c: 1 2 3 4
- * a: 1 2 3 4 5
- * </pre>
- * Expected:
- * <pre>
- * a-b-b-b-c (1 2 3 4)
- * </pre>
- */
- @Test
- public void example2() {
- CloneIndex cloneIndex = createIndex(
- blocksForResource("b").withHashes("1", "2", "3", "4", "1", "2", "3", "4", "1", "2", "3", "4"),
- blocksForResource("c").withHashes("1", "2", "3", "4"));
- List<Block> fileBlocks =
- blocksForResource("a").withHashes("1", "2", "3", "5");
- List<CloneGroup> clones = OriginalCloneDetectionAlgorithm.detect(cloneIndex, fileBlocks);
- print(clones);
- assertThat(clones.size(), is(1));
- Iterator<CloneGroup> clonesIterator = clones.iterator();
-
- CloneGroup clone = clonesIterator.next();
- assertThat(clone.getCloneUnitLength(), is(3));
- assertThat(clone.getCloneParts().size(), is(5));
- assertThat(clone.getOriginPart(), is(newClonePart("a", 0, 3)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 0, 3)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 0, 3)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 4, 3)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 8, 3)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("c", 0, 3)));
- }
-
- /**
- * Given:
- * <pre>
- * b: 1 2 3 4
- * a: 1 2 3
- * </pre>
- * Expected clone which ends at the end of file "a":
- * <pre>
- * a-b (1 2 3)
- * </pre>
- */
- @Test
- public void problemWithEndOfFile() {
- CloneIndex cloneIndex = createIndex(
- blocksForResource("b").withHashes("1", "2", "3", "4"));
- List<Block> fileBlocks =
- blocksForResource("a").withHashes("1", "2", "3");
- List<CloneGroup> clones = OriginalCloneDetectionAlgorithm.detect(cloneIndex, fileBlocks);
- print(clones);
- assertThat(clones.size(), is(1));
- Iterator<CloneGroup> clonesIterator = clones.iterator();
-
- CloneGroup clone = clonesIterator.next();
- assertThat(clone.getCloneUnitLength(), is(3));
- assertThat(clone.getCloneParts().size(), is(2));
- assertThat(clone.getOriginPart(), is(newClonePart("a", 0, 3)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 0, 3)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 0, 3)));
- }
-
- /**
- * Test for problem, which was described in original paper - same clone would be reported twice.
- * Given:
- * <pre>
- * a: 1 2 3 1 2 4
- * </pre>
- * Expected only one clone:
- * <pre>
- * a-a (1 2)
- * </pre>
- */
- @Test
- public void clonesInFileItself() {
- CloneIndex cloneIndex = createIndex();
- List<Block> fileBlocks =
- blocksForResource("a").withHashes("1", "2", "3", "1", "2", "4");
- List<CloneGroup> clones = OriginalCloneDetectionAlgorithm.detect(cloneIndex, fileBlocks);
- print(clones);
-
- assertThat(clones.size(), is(1));
- Iterator<CloneGroup> clonesIterator = clones.iterator();
-
- CloneGroup clone = clonesIterator.next();
- assertThat(clone.getCloneUnitLength(), is(2));
- assertThat(clone.getCloneParts().size(), is(2));
- assertThat(clone.getOriginPart(), is(newClonePart("a", 0, 2)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 0, 2)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 3, 2)));
- }
-
- /**
- * Given:
- * <pre>
- * b: 1 2 1 2
- * a: 1 2 1
- * </pre>
- * Expected:
- * <pre>
- * a-b-b (1 2)
- * a-b (1 2 1)
- * </pre>
- * "a-a-b-b (1)" should not be reported, because fully covered by "a-b (1 2 1)"
- */
- @Test
- public void covered() {
- CloneIndex cloneIndex = createIndex(
- blocksForResource("b").withHashes("1", "2", "1", "2"));
- List<Block> fileBlocks =
- blocksForResource("a").withHashes("1", "2", "1");
- List<CloneGroup> clones = OriginalCloneDetectionAlgorithm.detect(cloneIndex, fileBlocks);
- print(clones);
-
- assertThat(clones.size(), is(2));
- Iterator<CloneGroup> clonesIterator = clones.iterator();
-
- CloneGroup clone = clonesIterator.next();
- assertThat(clone.getCloneUnitLength(), is(2));
- assertThat(clone.getOriginPart(), is(newClonePart("a", 0, 2)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 0, 2)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 0, 2)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 2, 2)));
-
- clone = clonesIterator.next();
- assertThat(clone.getCloneUnitLength(), is(3));
- assertThat(clone.getOriginPart(), is(newClonePart("a", 0, 3)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 0, 3)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 0, 3)));
- }
-
- /**
- * Given:
- * <pre>
- * b: 1 2 1 2 1 2 1
- * a: 1 2 1 2 1 2
- * </pre>
- * Expected:
- * <pre>
- * a-b-b (1 2 1 2 1) - note that there is overlapping among parts for "b"
- * a-b (1 2 1 2 1 2)
- * </pre>
- */
- @Test
- public void problemWithNestedCloneGroups() {
- CloneIndex cloneIndex = createIndex(
- blocksForResource("b").withHashes("1", "2", "1", "2", "1", "2", "1"));
- List<Block> fileBlocks =
- blocksForResource("a").withHashes("1", "2", "1", "2", "1", "2");
- List<CloneGroup> clones = OriginalCloneDetectionAlgorithm.detect(cloneIndex, fileBlocks);
- print(clones);
-
- assertThat(clones.size(), is(2));
- Iterator<CloneGroup> clonesIterator = clones.iterator();
-
- CloneGroup clone = clonesIterator.next();
- assertThat(clone.getCloneUnitLength(), is(5));
- assertThat(clone.getCloneParts().size(), is(3));
- assertThat(clone.getOriginPart(), is(newClonePart("a", 0, 5)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 0, 5)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 0, 5)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 2, 5)));
-
- clone = clonesIterator.next();
- assertThat(clone.getCloneUnitLength(), is(6));
- assertThat(clone.getCloneParts().size(), is(2));
- assertThat(clone.getOriginPart(), is(newClonePart("a", 0, 6)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 0, 6)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 0, 6)));
- }
-
- /**
- * Given:
- * <pre>
- * a: 1 2 3
- * b: 1 2 4
- * a: 1 2 5
- * </pre>
- * Expected:
- * <pre>
- * a-b (1 2) - instead of "a-a-b", which will be the case if file from index not ignored
- * </pre>
- */
- @Test
- public void fileAlreadyInIndex() {
- CloneIndex cloneIndex = createIndex(
- blocksForResource("a").withHashes("1", "2", "3"),
- blocksForResource("b").withHashes("1", "2", "4"));
- // Note about blocks with hashes "3", "4" and "5": those blocks here in order to not face another problem - with EOF (see separate test)
- List<Block> fileBlocks =
- blocksForResource("a").withHashes("1", "2", "5");
- List<CloneGroup> clones = OriginalCloneDetectionAlgorithm.detect(cloneIndex, fileBlocks);
- print(clones);
-
- assertThat(clones.size(), is(1));
- Iterator<CloneGroup> clonesIterator = clones.iterator();
-
- CloneGroup clone = clonesIterator.next();
- assertThat(clone.getCloneUnitLength(), is(2));
- assertThat(clone.getCloneParts().size(), is(2));
- assertThat(clone.getOriginPart(), is(newClonePart("a", 0, 2)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 0, 2)));
- assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 0, 2)));
- }
-
- /**
- * Given: file with repeated hashes
- * Expected: only one query of index for each unique hash
- */
- @Test
- public void only_one_query_of_index_for_each_unique_hash() {
- CloneIndex index = spy(createIndex());
- List<Block> fileBlocks =
- blocksForResource("a").withHashes("1", "2", "1", "2");
- OriginalCloneDetectionAlgorithm.detect(index, fileBlocks);
-
- verify(index).getBySequenceHash(new ByteArray("1".getBytes()));
- verify(index).getBySequenceHash(new ByteArray("2".getBytes()));
- verifyNoMoreInteractions(index);
- }
-
- /**
- * Given file with two lines, containing following statements:
- * <pre>
- * 0: A,B,A,B
- * 1: A,B,A
- * </pre>
- * with block size 5 each block will span both lines, and hashes will be:
- * <pre>
- * A,B,A,B,A=1
- * B,A,B,A,B=2
- * A,B,A,B,A=1
- * </pre>
- * Expected: one clone with two parts, which contain exactly the same lines
- */
- @Test
- public void same_lines_but_different_indexes() {
- CloneIndex cloneIndex = createIndex();
- List<Block> fileBlocks = Arrays.asList(
- new Block("a", new ByteArray("1".getBytes()), 0, 0, 1),
- new Block("a", new ByteArray("2".getBytes()), 1, 0, 1),
- new Block("a", new ByteArray("1".getBytes()), 2, 0, 1));
- List<CloneGroup> clones = OriginalCloneDetectionAlgorithm.detect(cloneIndex, fileBlocks);
- print(clones);
-
- assertThat(clones.size(), is(1));
- Iterator<CloneGroup> clonesIterator = clones.iterator();
-
- CloneGroup clone = clonesIterator.next();
- assertThat(clone.getCloneUnitLength(), is(1));
- assertThat(clone.getCloneParts().size(), is(2));
- assertThat(clone.getOriginPart(), is(new ClonePart("a", 0, 0, 1)));
- assertThat(clone.getCloneParts(), hasItem(new ClonePart("a", 0, 0, 1)));
- assertThat(clone.getCloneParts(), hasItem(new ClonePart("a", 2, 0, 1)));
- }
-
- /**
- * Given: empty list of blocks for file
- * Expected: {@link Collections#EMPTY_LIST}
- */
- @Test
- public void shouldReturnEmptyListWhenNoBlocksForFile() {
- List<CloneGroup> result = OriginalCloneDetectionAlgorithm.detect(null, new ArrayList<Block>());
- assertThat(result, sameInstance(Collections.EMPTY_LIST));
- }
-
- private void print(List<CloneGroup> clones) {
- for (CloneGroup clone : clones) {
- System.out.println(clone);
- }
- System.out.println();
- }
-
- private static CloneIndex createIndex(List<Block>... blocks) {
- CloneIndex cloneIndex = new MemoryCloneIndex();
- for (List<Block> b : blocks) {
- for (Block block : b) {
- cloneIndex.insert(block);
- }
- }
- return cloneIndex;
- }
-
- private static BlocksBuilder blocksForResource(String resourceId) {
- return new BlocksBuilder(resourceId);
- }
-
- private static class BlocksBuilder {
- String resourceId;
-
- public BlocksBuilder(String resourceId) {
- this.resourceId = resourceId;
- }
- List<Block> withHashes(String... hashes) {
- ByteArray[] arrays = new ByteArray[hashes.length];
- for (int i = 0; i < hashes.length; i++) {
- arrays[i] = new ByteArray(hashes[i].getBytes());
- }
- return withHashes(arrays);
- }
+public class OriginalCloneDetectionAlgorithmTest extends DetectorTestCase {
- List<Block> withHashes(ByteArray... hashes) {
- List<Block> result = Lists.newArrayList();
- int index = 0;
- for (ByteArray hash : hashes) {
- result.add(newBlock(resourceId, hash, index));
- index++;
- }
- return result;
- }
+ @Override
+ protected List<CloneGroup> detect(CloneIndex index, List<Block> fileBlocks) {
+ return OriginalCloneDetectionAlgorithm.detect(index, fileBlocks);
}
}
diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/PrintCollector.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/PrintCollector.java
new file mode 100644
index 00000000000..ab4b0b9cd8f
--- /dev/null
+++ b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/PrintCollector.java
@@ -0,0 +1,47 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+import org.sonar.duplications.detector.suffixtree.Search;
+import org.sonar.duplications.detector.suffixtree.TextSet;
+
+public final class PrintCollector implements Search.Collector {
+
+ private final TextSet text;
+ private int groups;
+
+ public PrintCollector(TextSet text) {
+ this.text = text;
+ }
+
+ public void part(int start, int end, int len) {
+ System.out.println(start + " " + end + " : " + text.sequence(start, end));
+ }
+
+ public void endOfGroup() {
+ groups++;
+ System.out.println();
+ }
+
+ public int getGroups() {
+ return groups;
+ }
+
+}
diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTree.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTree.java
new file mode 100644
index 00000000000..80daa78225b
--- /dev/null
+++ b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTree.java
@@ -0,0 +1,96 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+import org.sonar.duplications.detector.suffixtree.Edge;
+import org.sonar.duplications.detector.suffixtree.Node;
+import org.sonar.duplications.detector.suffixtree.SuffixTree;
+import org.sonar.duplications.detector.suffixtree.Text;
+
+import com.google.common.base.Objects;
+
+public class StringSuffixTree {
+
+ private final SuffixTree suffixTree;
+
+ public static StringSuffixTree create(String text) {
+ return new StringSuffixTree(text);
+ }
+
+ private StringSuffixTree(String text) {
+ suffixTree = SuffixTree.create(new StringText(text));
+ }
+
+ public int indexOf(String str) {
+ return indexOf(suffixTree, new StringText(str));
+ }
+
+ public boolean contains(String str) {
+ return contains(suffixTree, new StringText(str));
+ }
+
+ public SuffixTree getSuffixTree() {
+ return suffixTree;
+ }
+
+ public static boolean contains(SuffixTree tree, Text str) {
+ return indexOf(tree, str) >= 0;
+ }
+
+ public static int indexOf(SuffixTree tree, Text str) {
+ if (str.length() == 0) {
+ return -1;
+ }
+
+ int index = -1;
+ Node node = tree.getRootNode();
+
+ int i = 0;
+ while (i < str.length()) {
+ if (node == null) {
+ return -1;
+ }
+ if (i == tree.text.length()) {
+ return -1;
+ }
+
+ Edge edge = node.findEdge(str.symbolAt(i));
+ if (edge == null) {
+ return -1;
+ }
+
+ index = edge.getBeginIndex() - i;
+ i++;
+
+ for (int j = edge.getBeginIndex() + 1; j <= edge.getEndIndex(); j++) {
+ if (i == str.length()) {
+ break;
+ }
+ if (!Objects.equal(tree.symbolAt(j), str.symbolAt(i))) {
+ return -1;
+ }
+ i++;
+ }
+ node = edge.getEndNode();
+ }
+ return index;
+ }
+
+}
diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTreeTest.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTreeTest.java
new file mode 100644
index 00000000000..7cd796c755f
--- /dev/null
+++ b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTreeTest.java
@@ -0,0 +1,106 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import org.junit.Test;
+
+public class StringSuffixTreeTest {
+
+ @Test
+ public void testMississippi() {
+ StringSuffixTree st = StringSuffixTree.create("mississippi");
+
+ assertTrue(st.contains("miss"));
+ assertTrue(st.contains("missis"));
+ assertTrue(st.contains("pi"));
+ }
+
+ @Test
+ public void testBanana() {
+ StringSuffixTree st = StringSuffixTree.create("banana");
+
+ assertTrue(st.contains("ana"));
+ assertTrue(st.contains("an"));
+ assertTrue(st.contains("na"));
+ }
+
+ @Test
+ public void testBook() {
+ StringSuffixTree st = StringSuffixTree.create("book");
+
+ assertTrue(st.contains("book"));
+ assertTrue(st.contains("oo"));
+ assertTrue(st.contains("ok"));
+ assertFalse(st.contains("okk"));
+ assertFalse(st.contains("bookk"));
+ assertFalse(st.contains("bok"));
+
+ assertEquals(0, st.indexOf("book"));
+ assertEquals(1, st.indexOf("oo"));
+ assertEquals(2, st.indexOf("ok"));
+ }
+
+ @Test
+ public void testBookke() {
+ StringSuffixTree st = StringSuffixTree.create("bookke");
+
+ assertTrue(st.contains("bookk"));
+
+ assertEquals(0, st.indexOf("book"));
+ assertEquals(1, st.indexOf("oo"));
+ assertEquals(2, st.indexOf("ok"));
+ }
+
+ @Test
+ public void testCacao() {
+ StringSuffixTree st = StringSuffixTree.create("cacao");
+
+ assertTrue(st.contains("aca"));
+
+ assertEquals(3, st.indexOf("ao"));
+ assertEquals(0, st.indexOf("ca"));
+ assertEquals(2, st.indexOf("cao"));
+ }
+
+ @Test
+ public void testGoogol() {
+ StringSuffixTree st = StringSuffixTree.create("googol");
+
+ assertTrue(st.contains("oo"));
+
+ assertEquals(0, st.indexOf("go"));
+ assertEquals(3, st.indexOf("gol"));
+ assertEquals(1, st.indexOf("oo"));
+ }
+
+ @Test
+ public void testAbababc() {
+ StringSuffixTree st = StringSuffixTree.create("abababc");
+
+ assertTrue(st.contains("aba"));
+
+ assertEquals(0, st.indexOf("aba"));
+ assertEquals(4, st.indexOf("abc"));
+ }
+}
diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringText.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringText.java
new file mode 100644
index 00000000000..7828f03494e
--- /dev/null
+++ b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringText.java
@@ -0,0 +1,37 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+import org.sonar.duplications.detector.suffixtree.AbstractText;
+import org.sonar.duplications.detector.suffixtree.Text;
+
+/**
+ * Implementation of {@link Text} based on {@link String}.
+ */
+public class StringText extends AbstractText {
+
+ public StringText(String text) {
+ super(text.length());
+ for (int i = 0; i < text.length(); i++) {
+ symbols.add(Character.valueOf(text.charAt(i)));
+ }
+ }
+
+}
diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithmTest.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithmTest.java
new file mode 100644
index 00000000000..572811ef3df
--- /dev/null
+++ b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithmTest.java
@@ -0,0 +1,120 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+import static org.hamcrest.Matchers.sameInstance;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertThat;
+import static org.sonar.duplications.detector.CloneGroupMatcher.hasCloneGroup;
+
+import java.util.Collections;
+import java.util.List;
+
+import org.junit.Test;
+import org.sonar.duplications.block.Block;
+import org.sonar.duplications.detector.DetectorTestCase;
+import org.sonar.duplications.index.CloneGroup;
+import org.sonar.duplications.index.CloneIndex;
+
+public class SuffixTreeCloneDetectionAlgorithmTest extends DetectorTestCase {
+
+ /**
+ * Given: file without duplications
+ * Expected: {@link Collections#EMPTY_LIST} (no need to construct suffix-tree)
+ */
+ @Test
+ public void noDuplications() {
+ CloneIndex index = createIndex();
+ List<Block> fileBlocks = newBlocks("a", "1 2 3");
+ List<CloneGroup> result = detect(index, fileBlocks);
+ assertThat(result, sameInstance(Collections.EMPTY_LIST));
+ }
+
+ /**
+ * Given:
+ * <pre>
+ * x: a 2 b 2 c 2 2 2
+ * </pre>
+ * Expected:
+ * <pre>
+ * x-x (2 2)
+ * x-x-x-x-x (2)
+ * <pre>
+ */
+ @Test
+ public void myTest() {
+ CloneIndex index = createIndex();
+ List<Block> fileBlocks = newBlocks("x", "a 2 b 2 c 2 2 2");
+ List<CloneGroup> result = detect(index, fileBlocks);
+
+ print(result);
+ assertEquals(2, result.size());
+
+ assertThat(result, hasCloneGroup(2,
+ newClonePart("x", 5, 2),
+ newClonePart("x", 6, 2)));
+
+ assertThat(result, hasCloneGroup(1,
+ newClonePart("x", 1, 1),
+ newClonePart("x", 3, 1),
+ newClonePart("x", 5, 1),
+ newClonePart("x", 6, 1),
+ newClonePart("x", 7, 1)));
+ }
+
+ /**
+ * Given:
+ * <pre>
+ * x: a 2 3 b 2 3 c 2 3 d 2 3 2 3 2 3
+ * </pre>
+ * Expected:
+ * <pre>
+ * x-x (2 3 2 3)
+ * x-x-x-x-x-x (2 3)
+ * <pre>
+ */
+ @Test
+ public void myTest2() {
+ CloneIndex index = createIndex();
+ List<Block> fileBlocks = newBlocks("x", "a 2 3 b 2 3 c 2 3 d 2 3 2 3 2 3");
+ List<CloneGroup> result = detect(index, fileBlocks);
+
+ print(result);
+ assertEquals(2, result.size());
+
+ assertThat(result, hasCloneGroup(4,
+ newClonePart("x", 10, 4),
+ newClonePart("x", 12, 4)));
+
+ assertThat(result, hasCloneGroup(2,
+ newClonePart("x", 1, 2),
+ newClonePart("x", 4, 2),
+ newClonePart("x", 7, 2),
+ newClonePart("x", 10, 2),
+ newClonePart("x", 12, 2),
+ newClonePart("x", 14, 2)));
+ }
+
+ @Override
+ protected List<CloneGroup> detect(CloneIndex index, List<Block> fileBlocks) {
+ return SuffixTreeCloneDetectionAlgorithm.detect(index, fileBlocks);
+ }
+
+}