aboutsummaryrefslogtreecommitdiffstats
path: root/sonar-duplications/src/main/java/org/sonar
diff options
context:
space:
mode:
authorEvgeny Mandrikov <mandrikov@gmail.com>2011-12-07 02:37:47 +0400
committerEvgeny Mandrikov <mandrikov@gmail.com>2011-12-07 03:31:04 +0400
commitf0d96c1053f6345e82351522c5bd8ce5633008bd (patch)
treeb984e5f755b82ee21073b7ab967d314b22a34fe4 /sonar-duplications/src/main/java/org/sonar
parent9ae42f76ea1c21b43e3ea6ec044fe45276be9875 (diff)
downloadsonarqube-f0d96c1053f6345e82351522c5bd8ce5633008bd.tar.gz
sonarqube-f0d96c1053f6345e82351522c5bd8ce5633008bd.zip
SONAR-3060 Add new CPD algorithm based on suffix tree
Diffstat (limited to 'sonar-duplications/src/main/java/org/sonar')
-rw-r--r--sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/AbstractText.java45
-rw-r--r--sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java200
-rw-r--r--sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Edge.java79
-rw-r--r--sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/GeneralisedHashText.java77
-rw-r--r--sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Node.java88
-rw-r--r--sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Search.java137
-rw-r--r--sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Suffix.java86
-rw-r--r--sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTree.java109
-rw-r--r--sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithm.java111
-rw-r--r--sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Text.java41
-rw-r--r--sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/TextSet.java73
11 files changed, 1046 insertions, 0 deletions
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/AbstractText.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/AbstractText.java
new file mode 100644
index 00000000000..5a3ffa0c14c
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/AbstractText.java
@@ -0,0 +1,45 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+import java.util.ArrayList;
+import java.util.List;
+
+public abstract class AbstractText implements Text {
+
+ protected final List<Object> symbols;
+
+ public AbstractText(int size) {
+ this.symbols = new ArrayList<Object>(size);
+ }
+
+ public int length() {
+ return symbols.size();
+ }
+
+ public Object symbolAt(int index) {
+ return symbols.get(index);
+ }
+
+ public List<Object> sequence(int fromIndex, int toIndex) {
+ return symbols.subList(fromIndex, toIndex);
+ }
+
+}
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java
new file mode 100644
index 00000000000..745fb90a3b3
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java
@@ -0,0 +1,200 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+
+import org.sonar.duplications.block.Block;
+import org.sonar.duplications.index.CloneGroup;
+import org.sonar.duplications.index.ClonePart;
+import org.sonar.duplications.utils.FastStringComparator;
+import org.sonar.duplications.utils.SortedListsUtils;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Implementation of {@link Search.Collector}, which constructs {@link CloneGroup}s.
+ */
+public class DuplicationsCollector implements Search.Collector {
+
+ private final GeneralisedHashText text;
+ private final String originResourceId;
+
+ /**
+ * Note that LinkedList should provide better performance here, because of use of operation remove.
+ *
+ * @see #filter(CloneGroup)
+ */
+ private final List<CloneGroup> filtered = Lists.newLinkedList();
+
+ private int length;
+ private ClonePart origin;
+ private final List<ClonePart> parts = Lists.newArrayList();
+
+ public DuplicationsCollector(GeneralisedHashText text) {
+ this.text = text;
+ this.originResourceId = text.getBlock(0).getResourceId();
+ }
+
+ /**
+ * @return current result
+ */
+ public List<CloneGroup> getResult() {
+ return filtered;
+ }
+
+ public void part(int start, int end, int len) {
+ length = len;
+ Block firstBlock = text.getBlock(start);
+ Block lastBlock = text.getBlock(end - 1);
+
+ ClonePart part = new ClonePart(
+ firstBlock.getResourceId(),
+ firstBlock.getIndexInFile(),
+ firstBlock.getFirstLineNumber(),
+ lastBlock.getLastLineNumber());
+
+ // TODO Godin: maybe use FastStringComparator here ?
+ if (originResourceId.equals(part.getResourceId())) { // part from origin
+ if (origin == null) {
+ origin = part;
+ } else if (part.getUnitStart() < origin.getUnitStart()) {
+ origin = part;
+ }
+ }
+
+ parts.add(part);
+ }
+
+ public void endOfGroup() {
+ Collections.sort(parts, CLONEPART_COMPARATOR);
+ CloneGroup group = new CloneGroup(length, origin, parts);
+ filter(group);
+
+ parts.clear();
+ origin = null;
+ }
+
+ private void filter(CloneGroup current) {
+ Iterator<CloneGroup> i = filtered.iterator();
+ while (i.hasNext()) {
+ CloneGroup earlier = i.next();
+ // Note that following two conditions cannot be true together - proof by contradiction:
+ // let C be the current clone and A and B were found earlier
+ // then since relation is transitive - (A in C) and (C in B) => (A in B)
+ // so A should be filtered earlier
+ if (containsIn(current, earlier)) {
+ // current clone fully covered by clone, which was found earlier
+ return;
+ }
+ // TODO Godin: must prove that this is unused
+ // if (containsIn(earlier, current)) {
+ // // current clone fully covers clone, which was found earlier
+ // i.remove();
+ // }
+ }
+ filtered.add(current);
+ }
+
+ private static final Comparator<ClonePart> CLONEPART_COMPARATOR = new Comparator<ClonePart>() {
+ public int compare(ClonePart o1, ClonePart o2) {
+ int c = RESOURCE_ID_COMPARATOR.compare(o1, o2);
+ if (c == 0) {
+ return o1.getUnitStart() - o2.getUnitStart();
+ }
+ return c;
+ }
+ };
+
+ private static final Comparator<ClonePart> RESOURCE_ID_COMPARATOR = new Comparator<ClonePart>() {
+ public int compare(ClonePart o1, ClonePart o2) {
+ return FastStringComparator.INSTANCE.compare(o1.getResourceId(), o2.getResourceId());
+ }
+ };
+
+ /**
+ * Checks that second clone contains first one.
+ * <p>
+ * Clone A is contained in another clone B, if every part pA from A has part pB in B,
+ * which satisfy the conditions:
+ * <pre>
+ * (pA.resourceId == pB.resourceId) and (pB.unitStart <= pA.unitStart) and (pA.unitEnd <= pB.unitEnd)
+ * </pre>
+ * And all resourcesId from B exactly the same as all resourceId from A, which means that also every part pB from B has part pA in A,
+ * which satisfy the condition:
+ * <pre>
+ * pB.resourceId == pA.resourceId
+ * </pre>
+ * So this relation is:
+ * <ul>
+ * <li>reflexive - A in A</li>
+ * <li>transitive - (A in B) and (B in C) => (A in C)</li>
+ * <li>antisymmetric - (A in B) and (B in A) <=> (A = B)</li>
+ * </ul>
+ * </p>
+ * <p>
+ * This method uses the fact that all parts already sorted by resourceId and unitStart (see {@link #CLONEPART_COMPARATOR}),
+ * so running time - O(|A|+|B|).
+ * </p>
+ */
+ private static boolean containsIn(CloneGroup first, CloneGroup second) {
+ // TODO Godin: must prove that this is unused
+ // if (!first.getOriginPart().getResourceId().equals(second.getOriginPart().getResourceId())) {
+ // return false;
+ // }
+ if (first.getCloneUnitLength() > second.getCloneUnitLength()) {
+ return false;
+ }
+ List<ClonePart> firstParts = first.getCloneParts();
+ List<ClonePart> secondParts = second.getCloneParts();
+ return SortedListsUtils.contains(secondParts, firstParts, new ContainsInComparator(first.getCloneUnitLength(), second.getCloneUnitLength()))
+ && SortedListsUtils.contains(firstParts, secondParts, RESOURCE_ID_COMPARATOR);
+ }
+
+ private static class ContainsInComparator implements Comparator<ClonePart> {
+ private final int l1, l2;
+
+ public ContainsInComparator(int l1, int l2) {
+ this.l1 = l1;
+ this.l2 = l2;
+ }
+
+ public int compare(ClonePart o1, ClonePart o2) {
+ int c = RESOURCE_ID_COMPARATOR.compare(o1, o2);
+ if (c == 0) {
+ if (o2.getUnitStart() <= o1.getUnitStart()) {
+ if (o1.getUnitStart() + l1 <= o2.getUnitStart() + l2) {
+ return 0; // match found - stop search
+ } else {
+ return 1; // continue search
+ }
+ } else {
+ return -1; // o1 < o2 by unitStart - stop search
+ }
+ } else {
+ return c;
+ }
+ }
+ }
+
+}
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Edge.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Edge.java
new file mode 100644
index 00000000000..968fbd9e226
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Edge.java
@@ -0,0 +1,79 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+public final class Edge {
+
+ private int beginIndex; // can't be changed
+ private int endIndex;
+ private Node startNode;
+ private Node endNode; // can't be changed, could be used as edge id
+
+ // each time edge is created, a new end node is created
+ public Edge(int beginIndex, int endIndex, Node startNode) {
+ this.beginIndex = beginIndex;
+ this.endIndex = endIndex;
+ this.startNode = startNode;
+ this.endNode = new Node(startNode, null);
+ }
+
+ public Node splitEdge(Suffix suffix) {
+ remove();
+ Edge newEdge = new Edge(beginIndex, beginIndex + suffix.getSpan(), suffix.getOriginNode());
+ newEdge.insert();
+ newEdge.endNode.setSuffixNode(suffix.getOriginNode());
+ beginIndex += suffix.getSpan() + 1;
+ startNode = newEdge.getEndNode();
+ insert();
+ return newEdge.getEndNode();
+ }
+
+ public void insert() {
+ startNode.addEdge(beginIndex, this);
+ }
+
+ public void remove() {
+ startNode.removeEdge(beginIndex);
+ }
+
+ /**
+ * @return length of this edge in symbols
+ */
+ public int getSpan() {
+ return endIndex - beginIndex;
+ }
+
+ public int getBeginIndex() {
+ return beginIndex;
+ }
+
+ public int getEndIndex() {
+ return endIndex;
+ }
+
+ public Node getStartNode() {
+ return startNode;
+ }
+
+ public Node getEndNode() {
+ return endNode;
+ }
+
+}
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/GeneralisedHashText.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/GeneralisedHashText.java
new file mode 100644
index 00000000000..202cb85084c
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/GeneralisedHashText.java
@@ -0,0 +1,77 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+import java.util.List;
+
+import org.sonar.duplications.block.Block;
+
+import com.google.common.collect.Lists;
+
+public class GeneralisedHashText extends TextSet {
+
+ public GeneralisedHashText(List<Block>... blocks) {
+ super(blocks.length);
+
+ for (int i = 0; i < blocks.length; i++) {
+ addAll(blocks[i]);
+ addTerminator();
+ }
+ finish();
+ }
+
+ private int count;
+ private List<Integer> sizes = Lists.newArrayList();
+
+ public void addBlock(Block block) {
+ symbols.add(block);
+ }
+
+ public void addAll(List<Block> list) {
+ symbols.addAll(list);
+ }
+
+ public void addTerminator() {
+ symbols.add(new Terminator(count));
+ sizes.add(symbols.size());
+ count++;
+ }
+
+ public void finish() {
+ super.lens = new int[sizes.size()];
+ for (int i = 0; i < sizes.size(); i++) {
+ super.lens[i] = sizes.get(i);
+ }
+ }
+
+ @Override
+ public Object symbolAt(int index) {
+ Object obj = super.symbolAt(index);
+ if (obj instanceof Block) {
+ return ((Block) obj).getBlockHash();
+ }
+ return obj;
+ }
+
+ public Block getBlock(int index) {
+ return (Block) super.symbolAt(index);
+ }
+
+}
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Node.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Node.java
new file mode 100644
index 00000000000..8fa39ecc832
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Node.java
@@ -0,0 +1,88 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+import java.util.Map;
+import java.util.HashMap;
+import java.util.Collection;
+
+public final class Node {
+
+ private final SuffixTree suffixTree;
+ private final Map<Object, Edge> edges;
+
+ /**
+ * Node represents string s[i],s[i+1],...,s[j],
+ * suffix-link is a link to node, which represents string s[i+1],...,s[j].
+ */
+ private Node suffixNode;
+
+ /**
+ * Number of symbols from the root to this node.
+ * <p>
+ * Note that this is not equal to number of nodes from root to this node,
+ * because in a compact suffix-tree edge can span multiple symbols - see {@link Edge#getSpan()}.
+ * </p><p>
+ * Depth of {@link #suffixNode} is always equal to this depth minus one.
+ * </p>
+ */
+ int depth;
+
+ int startSize, endSize;
+
+ public Node(Node node, Node suffixNode) {
+ this(node.suffixTree, suffixNode);
+ }
+
+ public Node(SuffixTree suffixTree, Node suffixNode) {
+ this.suffixTree = suffixTree;
+ this.suffixNode = suffixNode;
+ edges = new HashMap<Object, Edge>();
+ }
+
+ public Object symbolAt(int index) {
+ return suffixTree.symbolAt(index);
+ }
+
+ public void addEdge(int charIndex, Edge edge) {
+ edges.put(symbolAt(charIndex), edge);
+ }
+
+ public void removeEdge(int charIndex) {
+ edges.remove(symbolAt(charIndex));
+ }
+
+ public Edge findEdge(Object ch) {
+ return edges.get(ch);
+ }
+
+ public Node getSuffixNode() {
+ return suffixNode;
+ }
+
+ public void setSuffixNode(Node suffixNode) {
+ this.suffixNode = suffixNode;
+ }
+
+ public Collection<Edge> getEdges() {
+ return edges.values();
+ }
+
+}
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Search.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Search.java
new file mode 100644
index 00000000000..f13410e90d4
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Search.java
@@ -0,0 +1,137 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+import java.util.*;
+
+public final class Search {
+
+ private final SuffixTree tree;
+ private final int[] lens;
+ private final Collector reporter;
+
+ private final List<Integer> list = new ArrayList<Integer>();
+ private final List<Node> innerNodes = new ArrayList<Node>();
+
+ public static void perform(TextSet text, Collector reporter) {
+ new Search(SuffixTree.create(text), text.lens, reporter).compute();
+ }
+
+ private Search(SuffixTree tree, int[] lens, Collector reporter) {
+ this.tree = tree;
+ this.lens = lens;
+ this.reporter = reporter;
+ }
+
+ private void compute() {
+ // O(N)
+ computeDepth();
+
+ // O(N * log(N))
+ Collections.sort(innerNodes, DEPTH_COMPARATOR);
+
+ // O(N), recursive
+ createListOfLeafs(tree.getRootNode());
+
+ // O(N)
+ visitInnerNodes();
+ }
+
+ private void computeDepth() {
+ Queue<Node> queue = new LinkedList<Node>();
+ queue.add(tree.getRootNode());
+ tree.getRootNode().depth = 0;
+ while (!queue.isEmpty()) {
+ Node node = queue.remove();
+ if (!node.getEdges().isEmpty()) {
+ if (node != tree.getRootNode()) { // inner node = not leaf and not root
+ innerNodes.add(node);
+ }
+ for (Edge edge : node.getEdges()) {
+ Node endNode = edge.getEndNode();
+ endNode.depth = node.depth + edge.getSpan() + 1;
+ queue.add(endNode);
+ }
+ }
+ }
+ }
+
+ private static final Comparator<Node> DEPTH_COMPARATOR = new Comparator<Node>() {
+ public int compare(Node o1, Node o2) {
+ return o2.depth - o1.depth;
+ }
+ };
+
+ private void createListOfLeafs(Node node) {
+ node.startSize = list.size();
+ if (node.getEdges().isEmpty()) { // leaf
+ list.add(node.depth);
+ } else {
+ for (Edge edge : node.getEdges()) {
+ createListOfLeafs(edge.getEndNode());
+ }
+ node.endSize = list.size();
+ }
+ }
+
+ /**
+ * Each inner-node represents prefix of some suffixes, thus substring of text.
+ */
+ private void visitInnerNodes() {
+ for (Node node : innerNodes) {
+ if (containsOrigin(node)) {
+ report(node);
+ }
+ }
+ }
+
+ private boolean containsOrigin(Node node) {
+ for (int i = node.startSize; i < node.endSize; i++) {
+ int start = tree.text.length() - list.get(i);
+ int end = start + node.depth;
+ if (end < lens[0]) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ private void report(Node node) {
+ for (int i = node.startSize; i < node.endSize; i++) {
+ int start = tree.text.length() - list.get(i);
+ int end = start + node.depth;
+ reporter.part(start, end, node.depth);
+ }
+ reporter.endOfGroup();
+ }
+
+ public interface Collector {
+
+ /**
+ * @param start start position in generalised text
+ * @param end end position in generalised text
+ */
+ void part(int start, int end, int len);
+
+ void endOfGroup();
+
+ }
+
+}
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Suffix.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Suffix.java
new file mode 100644
index 00000000000..2f0165d7d41
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Suffix.java
@@ -0,0 +1,86 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+public final class Suffix {
+
+ private Node originNode;
+ private int beginIndex;
+ private int endIndex;
+
+ public Suffix(Node originNode, int beginIndex, int endIndex) {
+ this.originNode = originNode;
+ this.beginIndex = beginIndex;
+ this.endIndex = endIndex;
+ }
+
+ public boolean isExplicit() {
+ return beginIndex > endIndex;
+ }
+
+ public boolean isImplicit() {
+ return !isExplicit();
+ }
+
+ public void canonize() {
+ if (isImplicit()) {
+ Edge edge = originNode.findEdge(originNode.symbolAt(beginIndex));
+
+ int edgeSpan = edge.getSpan();
+ while (edgeSpan <= getSpan()) {
+ beginIndex += edgeSpan + 1;
+ originNode = edge.getEndNode();
+ if (beginIndex <= endIndex) {
+ edge = edge.getEndNode().findEdge(originNode.symbolAt(beginIndex));
+ edgeSpan = edge.getSpan();
+ }
+ }
+ }
+ }
+
+ public int getSpan() {
+ return endIndex - beginIndex;
+ }
+
+ public Node getOriginNode() {
+ return originNode;
+ }
+
+ public int getBeginIndex() {
+ return beginIndex;
+ }
+
+ public void incBeginIndex() {
+ beginIndex++;
+ }
+
+ public void changeOriginNode() {
+ originNode = originNode.getSuffixNode();
+ }
+
+ public int getEndIndex() {
+ return endIndex;
+ }
+
+ public void incEndIndex() {
+ endIndex++;
+ }
+
+}
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTree.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTree.java
new file mode 100644
index 00000000000..b6e21c940c7
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTree.java
@@ -0,0 +1,109 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+import com.google.common.base.Objects;
+
+/**
+ * The implementation of the algorithm for constructing suffix-tree based on
+ * <a href="http://illya-keeplearning.blogspot.com/search/label/suffix%20tree">Java-port</a> of
+ * <a href="http://marknelson.us/1996/08/01/suffix-trees/">Mark Nelson's C++ implementation of Ukkonen's algorithm</a>.
+ */
+public final class SuffixTree {
+
+ final Text text;
+
+ private final Node root;
+
+ public static SuffixTree create(Text text) {
+ SuffixTree tree = new SuffixTree(text);
+ Suffix active = new Suffix(tree.root, 0, -1);
+ for (int i = 0; i < text.length(); i++) {
+ tree.addPrefix(active, i);
+ }
+ return tree;
+ }
+
+ private SuffixTree(Text text) {
+ this.text = text;
+ root = new Node(this, null);
+ }
+
+ private void addPrefix(Suffix active, int endIndex) {
+ Node lastParentNode = null;
+ Node parentNode;
+
+ while (true) {
+ Edge edge;
+ parentNode = active.getOriginNode();
+
+ // Step 1 is to try and find a matching edge for the given node.
+ // If a matching edge exists, we are done adding edges, so we break out of this big loop.
+ if (active.isExplicit()) {
+ edge = active.getOriginNode().findEdge(symbolAt(endIndex));
+ if (edge != null) {
+ break;
+ }
+ } else {
+ // implicit node, a little more complicated
+ edge = active.getOriginNode().findEdge(symbolAt(active.getBeginIndex()));
+ int span = active.getSpan();
+ if (Objects.equal(symbolAt(edge.getBeginIndex() + span + 1), symbolAt(endIndex))) {
+ break;
+ }
+ parentNode = edge.splitEdge(active);
+ }
+
+ // We didn't find a matching edge, so we create a new one, add it to the tree at the parent node position,
+ // and insert it into the hash table. When we create a new node, it also means we need to create
+ // a suffix link to the new node from the last node we visited.
+ Edge newEdge = new Edge(endIndex, text.length() - 1, parentNode);
+ newEdge.insert();
+ updateSuffixNode(lastParentNode, parentNode);
+ lastParentNode = parentNode;
+
+ // This final step is where we move to the next smaller suffix
+ if (active.getOriginNode() == root) {
+ active.incBeginIndex();
+ } else {
+ active.changeOriginNode();
+ }
+ active.canonize();
+ }
+ updateSuffixNode(lastParentNode, parentNode);
+ active.incEndIndex(); // Now the endpoint is the next active point
+ active.canonize();
+ }
+
+ private void updateSuffixNode(Node node, Node suffixNode) {
+ if ((node != null) && (node != root)) {
+ node.setSuffixNode(suffixNode);
+ }
+ }
+
+ public Object symbolAt(int index) {
+ return text.symbolAt(index);
+ }
+
+ public Node getRootNode() {
+ return root;
+ }
+
+}
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithm.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithm.java
new file mode 100644
index 00000000000..0bd13a88cbf
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithm.java
@@ -0,0 +1,111 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+import java.util.*;
+
+import org.sonar.duplications.block.Block;
+import org.sonar.duplications.block.ByteArray;
+import org.sonar.duplications.index.CloneGroup;
+import org.sonar.duplications.index.CloneIndex;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+
+public final class SuffixTreeCloneDetectionAlgorithm {
+
+ public static List<CloneGroup> detect(CloneIndex cloneIndex, Collection<Block> fileBlocks) {
+ if (fileBlocks.isEmpty()) {
+ return Collections.EMPTY_LIST;
+ }
+ GeneralisedHashText text = retrieveFromIndex(cloneIndex, fileBlocks);
+ if (text == null) {
+ return Collections.EMPTY_LIST;
+ }
+ DuplicationsCollector reporter = new DuplicationsCollector(text);
+ Search.perform(text, reporter);
+ return reporter.getResult();
+ }
+
+ private SuffixTreeCloneDetectionAlgorithm() {
+ }
+
+ private static GeneralisedHashText retrieveFromIndex(CloneIndex index, Collection<Block> fileBlocks) {
+ String originResourceId = fileBlocks.iterator().next().getResourceId();
+
+ Set<ByteArray> hashes = Sets.newHashSet();
+ for (Block fileBlock : fileBlocks) {
+ hashes.add(fileBlock.getBlockHash());
+ }
+
+ Map<String, List<Block>> collection = Maps.newHashMap();
+ for (ByteArray hash : hashes) {
+ Collection<Block> blocks = index.getBySequenceHash(hash);
+ for (Block blockFromIndex : blocks) {
+ // Godin: skip blocks for this file if they come from index
+ String resourceId = blockFromIndex.getResourceId();
+ if (!originResourceId.equals(resourceId)) {
+ List<Block> list = collection.get(resourceId);
+ if (list == null) {
+ list = Lists.newArrayList();
+ collection.put(resourceId, list);
+ }
+ list.add(blockFromIndex);
+ }
+ }
+ }
+
+ if (collection.isEmpty() && hashes.size() == fileBlocks.size()) { // optimization for the case when there is no duplications
+ return null;
+ }
+
+ GeneralisedHashText text = new GeneralisedHashText();
+ List<Block> sortedFileBlocks = Lists.newArrayList(fileBlocks);
+ Collections.sort(sortedFileBlocks, BLOCK_COMPARATOR);
+ text.addAll(sortedFileBlocks);
+ text.addTerminator();
+
+ for (List<Block> list : collection.values()) {
+ Collections.sort(list, BLOCK_COMPARATOR);
+
+ int i = 0;
+ while (i < list.size()) {
+ int j = i + 1;
+ while ((j < list.size()) && (list.get(j).getIndexInFile() == list.get(j - 1).getIndexInFile() + 1)) {
+ j++;
+ }
+ text.addAll(list.subList(i, j));
+ text.addTerminator();
+ i = j;
+ }
+ }
+
+ text.finish();
+
+ return text;
+ }
+
+ private static final Comparator<Block> BLOCK_COMPARATOR = new Comparator<Block>() {
+ public int compare(Block o1, Block o2) {
+ return o1.getIndexInFile() - o2.getIndexInFile();
+ }
+ };
+
+}
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Text.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Text.java
new file mode 100644
index 00000000000..19b644d494a
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Text.java
@@ -0,0 +1,41 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+import java.util.List;
+
+/**
+ * Represents text.
+ */
+public interface Text {
+
+ /**
+ * @return length of the sequence of symbols represented by this object
+ */
+ int length();
+
+ /**
+ * @return symbol at the specified index
+ */
+ Object symbolAt(int index);
+
+ List<Object> sequence(int fromIndex, int toIndex);
+
+}
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/TextSet.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/TextSet.java
new file mode 100644
index 00000000000..aa3628ed048
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/TextSet.java
@@ -0,0 +1,73 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+/**
+ * Simplifies construction of <a href="http://en.wikipedia.org/wiki/Generalised_suffix_tree">generalised suffix-tree</a>.
+ */
+public class TextSet extends AbstractText {
+
+ int[] lens;
+
+ public TextSet(int size) {
+ super(100); // FIXME
+
+ lens = new int[size];
+ }
+
+ public TextSet(Text... text) {
+ this(text.length);
+ for (int i = 0; i < text.length; i++) {
+ symbols.addAll(text[i].sequence(0, text[i].length()));
+ symbols.add(new Terminator(i));
+ lens[i] = symbols.size();
+ }
+ }
+
+ public static class Terminator {
+
+ private final int stringNumber;
+
+ public Terminator(int i) {
+ this.stringNumber = i;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ return (obj instanceof Terminator) && (((Terminator) obj).stringNumber == stringNumber);
+ }
+
+ @Override
+ public int hashCode() {
+ return stringNumber;
+ }
+
+ public int getStringNumber() {
+ return stringNumber;
+ }
+
+ @Override
+ public String toString() {
+ return "$" + stringNumber;
+ }
+
+ }
+
+}