diff options
author | Evgeny Mandrikov <mandrikov@gmail.com> | 2011-12-07 02:37:47 +0400 |
---|---|---|
committer | Evgeny Mandrikov <mandrikov@gmail.com> | 2011-12-07 03:31:04 +0400 |
commit | f0d96c1053f6345e82351522c5bd8ce5633008bd (patch) | |
tree | b984e5f755b82ee21073b7ab967d314b22a34fe4 /sonar-duplications/src | |
parent | 9ae42f76ea1c21b43e3ea6ec044fe45276be9875 (diff) | |
download | sonarqube-f0d96c1053f6345e82351522c5bd8ce5633008bd.tar.gz sonarqube-f0d96c1053f6345e82351522c5bd8ce5633008bd.zip |
SONAR-3060 Add new CPD algorithm based on suffix tree
Diffstat (limited to 'sonar-duplications/src')
19 files changed, 1977 insertions, 481 deletions
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/AbstractText.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/AbstractText.java new file mode 100644 index 00000000000..5a3ffa0c14c --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/AbstractText.java @@ -0,0 +1,45 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +import java.util.ArrayList; +import java.util.List; + +public abstract class AbstractText implements Text { + + protected final List<Object> symbols; + + public AbstractText(int size) { + this.symbols = new ArrayList<Object>(size); + } + + public int length() { + return symbols.size(); + } + + public Object symbolAt(int index) { + return symbols.get(index); + } + + public List<Object> sequence(int fromIndex, int toIndex) { + return symbols.subList(fromIndex, toIndex); + } + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java new file mode 100644 index 00000000000..745fb90a3b3 --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java @@ -0,0 +1,200 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +import java.util.Collections; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; + +import org.sonar.duplications.block.Block; +import org.sonar.duplications.index.CloneGroup; +import org.sonar.duplications.index.ClonePart; +import org.sonar.duplications.utils.FastStringComparator; +import org.sonar.duplications.utils.SortedListsUtils; + +import com.google.common.collect.Lists; + +/** + * Implementation of {@link Search.Collector}, which constructs {@link CloneGroup}s. + */ +public class DuplicationsCollector implements Search.Collector { + + private final GeneralisedHashText text; + private final String originResourceId; + + /** + * Note that LinkedList should provide better performance here, because of use of operation remove. + * + * @see #filter(CloneGroup) + */ + private final List<CloneGroup> filtered = Lists.newLinkedList(); + + private int length; + private ClonePart origin; + private final List<ClonePart> parts = Lists.newArrayList(); + + public DuplicationsCollector(GeneralisedHashText text) { + this.text = text; + this.originResourceId = text.getBlock(0).getResourceId(); + } + + /** + * @return current result + */ + public List<CloneGroup> getResult() { + return filtered; + } + + public void part(int start, int end, int len) { + length = len; + Block firstBlock = text.getBlock(start); + Block lastBlock = text.getBlock(end - 1); + + ClonePart part = new ClonePart( + firstBlock.getResourceId(), + firstBlock.getIndexInFile(), + firstBlock.getFirstLineNumber(), + lastBlock.getLastLineNumber()); + + // TODO Godin: maybe use FastStringComparator here ? + if (originResourceId.equals(part.getResourceId())) { // part from origin + if (origin == null) { + origin = part; + } else if (part.getUnitStart() < origin.getUnitStart()) { + origin = part; + } + } + + parts.add(part); + } + + public void endOfGroup() { + Collections.sort(parts, CLONEPART_COMPARATOR); + CloneGroup group = new CloneGroup(length, origin, parts); + filter(group); + + parts.clear(); + origin = null; + } + + private void filter(CloneGroup current) { + Iterator<CloneGroup> i = filtered.iterator(); + while (i.hasNext()) { + CloneGroup earlier = i.next(); + // Note that following two conditions cannot be true together - proof by contradiction: + // let C be the current clone and A and B were found earlier + // then since relation is transitive - (A in C) and (C in B) => (A in B) + // so A should be filtered earlier + if (containsIn(current, earlier)) { + // current clone fully covered by clone, which was found earlier + return; + } + // TODO Godin: must prove that this is unused + // if (containsIn(earlier, current)) { + // // current clone fully covers clone, which was found earlier + // i.remove(); + // } + } + filtered.add(current); + } + + private static final Comparator<ClonePart> CLONEPART_COMPARATOR = new Comparator<ClonePart>() { + public int compare(ClonePart o1, ClonePart o2) { + int c = RESOURCE_ID_COMPARATOR.compare(o1, o2); + if (c == 0) { + return o1.getUnitStart() - o2.getUnitStart(); + } + return c; + } + }; + + private static final Comparator<ClonePart> RESOURCE_ID_COMPARATOR = new Comparator<ClonePart>() { + public int compare(ClonePart o1, ClonePart o2) { + return FastStringComparator.INSTANCE.compare(o1.getResourceId(), o2.getResourceId()); + } + }; + + /** + * Checks that second clone contains first one. + * <p> + * Clone A is contained in another clone B, if every part pA from A has part pB in B, + * which satisfy the conditions: + * <pre> + * (pA.resourceId == pB.resourceId) and (pB.unitStart <= pA.unitStart) and (pA.unitEnd <= pB.unitEnd) + * </pre> + * And all resourcesId from B exactly the same as all resourceId from A, which means that also every part pB from B has part pA in A, + * which satisfy the condition: + * <pre> + * pB.resourceId == pA.resourceId + * </pre> + * So this relation is: + * <ul> + * <li>reflexive - A in A</li> + * <li>transitive - (A in B) and (B in C) => (A in C)</li> + * <li>antisymmetric - (A in B) and (B in A) <=> (A = B)</li> + * </ul> + * </p> + * <p> + * This method uses the fact that all parts already sorted by resourceId and unitStart (see {@link #CLONEPART_COMPARATOR}), + * so running time - O(|A|+|B|). + * </p> + */ + private static boolean containsIn(CloneGroup first, CloneGroup second) { + // TODO Godin: must prove that this is unused + // if (!first.getOriginPart().getResourceId().equals(second.getOriginPart().getResourceId())) { + // return false; + // } + if (first.getCloneUnitLength() > second.getCloneUnitLength()) { + return false; + } + List<ClonePart> firstParts = first.getCloneParts(); + List<ClonePart> secondParts = second.getCloneParts(); + return SortedListsUtils.contains(secondParts, firstParts, new ContainsInComparator(first.getCloneUnitLength(), second.getCloneUnitLength())) + && SortedListsUtils.contains(firstParts, secondParts, RESOURCE_ID_COMPARATOR); + } + + private static class ContainsInComparator implements Comparator<ClonePart> { + private final int l1, l2; + + public ContainsInComparator(int l1, int l2) { + this.l1 = l1; + this.l2 = l2; + } + + public int compare(ClonePart o1, ClonePart o2) { + int c = RESOURCE_ID_COMPARATOR.compare(o1, o2); + if (c == 0) { + if (o2.getUnitStart() <= o1.getUnitStart()) { + if (o1.getUnitStart() + l1 <= o2.getUnitStart() + l2) { + return 0; // match found - stop search + } else { + return 1; // continue search + } + } else { + return -1; // o1 < o2 by unitStart - stop search + } + } else { + return c; + } + } + } + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Edge.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Edge.java new file mode 100644 index 00000000000..968fbd9e226 --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Edge.java @@ -0,0 +1,79 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +public final class Edge { + + private int beginIndex; // can't be changed + private int endIndex; + private Node startNode; + private Node endNode; // can't be changed, could be used as edge id + + // each time edge is created, a new end node is created + public Edge(int beginIndex, int endIndex, Node startNode) { + this.beginIndex = beginIndex; + this.endIndex = endIndex; + this.startNode = startNode; + this.endNode = new Node(startNode, null); + } + + public Node splitEdge(Suffix suffix) { + remove(); + Edge newEdge = new Edge(beginIndex, beginIndex + suffix.getSpan(), suffix.getOriginNode()); + newEdge.insert(); + newEdge.endNode.setSuffixNode(suffix.getOriginNode()); + beginIndex += suffix.getSpan() + 1; + startNode = newEdge.getEndNode(); + insert(); + return newEdge.getEndNode(); + } + + public void insert() { + startNode.addEdge(beginIndex, this); + } + + public void remove() { + startNode.removeEdge(beginIndex); + } + + /** + * @return length of this edge in symbols + */ + public int getSpan() { + return endIndex - beginIndex; + } + + public int getBeginIndex() { + return beginIndex; + } + + public int getEndIndex() { + return endIndex; + } + + public Node getStartNode() { + return startNode; + } + + public Node getEndNode() { + return endNode; + } + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/GeneralisedHashText.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/GeneralisedHashText.java new file mode 100644 index 00000000000..202cb85084c --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/GeneralisedHashText.java @@ -0,0 +1,77 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +import java.util.List; + +import org.sonar.duplications.block.Block; + +import com.google.common.collect.Lists; + +public class GeneralisedHashText extends TextSet { + + public GeneralisedHashText(List<Block>... blocks) { + super(blocks.length); + + for (int i = 0; i < blocks.length; i++) { + addAll(blocks[i]); + addTerminator(); + } + finish(); + } + + private int count; + private List<Integer> sizes = Lists.newArrayList(); + + public void addBlock(Block block) { + symbols.add(block); + } + + public void addAll(List<Block> list) { + symbols.addAll(list); + } + + public void addTerminator() { + symbols.add(new Terminator(count)); + sizes.add(symbols.size()); + count++; + } + + public void finish() { + super.lens = new int[sizes.size()]; + for (int i = 0; i < sizes.size(); i++) { + super.lens[i] = sizes.get(i); + } + } + + @Override + public Object symbolAt(int index) { + Object obj = super.symbolAt(index); + if (obj instanceof Block) { + return ((Block) obj).getBlockHash(); + } + return obj; + } + + public Block getBlock(int index) { + return (Block) super.symbolAt(index); + } + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Node.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Node.java new file mode 100644 index 00000000000..8fa39ecc832 --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Node.java @@ -0,0 +1,88 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +import java.util.Map; +import java.util.HashMap; +import java.util.Collection; + +public final class Node { + + private final SuffixTree suffixTree; + private final Map<Object, Edge> edges; + + /** + * Node represents string s[i],s[i+1],...,s[j], + * suffix-link is a link to node, which represents string s[i+1],...,s[j]. + */ + private Node suffixNode; + + /** + * Number of symbols from the root to this node. + * <p> + * Note that this is not equal to number of nodes from root to this node, + * because in a compact suffix-tree edge can span multiple symbols - see {@link Edge#getSpan()}. + * </p><p> + * Depth of {@link #suffixNode} is always equal to this depth minus one. + * </p> + */ + int depth; + + int startSize, endSize; + + public Node(Node node, Node suffixNode) { + this(node.suffixTree, suffixNode); + } + + public Node(SuffixTree suffixTree, Node suffixNode) { + this.suffixTree = suffixTree; + this.suffixNode = suffixNode; + edges = new HashMap<Object, Edge>(); + } + + public Object symbolAt(int index) { + return suffixTree.symbolAt(index); + } + + public void addEdge(int charIndex, Edge edge) { + edges.put(symbolAt(charIndex), edge); + } + + public void removeEdge(int charIndex) { + edges.remove(symbolAt(charIndex)); + } + + public Edge findEdge(Object ch) { + return edges.get(ch); + } + + public Node getSuffixNode() { + return suffixNode; + } + + public void setSuffixNode(Node suffixNode) { + this.suffixNode = suffixNode; + } + + public Collection<Edge> getEdges() { + return edges.values(); + } + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Search.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Search.java new file mode 100644 index 00000000000..f13410e90d4 --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Search.java @@ -0,0 +1,137 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +import java.util.*; + +public final class Search { + + private final SuffixTree tree; + private final int[] lens; + private final Collector reporter; + + private final List<Integer> list = new ArrayList<Integer>(); + private final List<Node> innerNodes = new ArrayList<Node>(); + + public static void perform(TextSet text, Collector reporter) { + new Search(SuffixTree.create(text), text.lens, reporter).compute(); + } + + private Search(SuffixTree tree, int[] lens, Collector reporter) { + this.tree = tree; + this.lens = lens; + this.reporter = reporter; + } + + private void compute() { + // O(N) + computeDepth(); + + // O(N * log(N)) + Collections.sort(innerNodes, DEPTH_COMPARATOR); + + // O(N), recursive + createListOfLeafs(tree.getRootNode()); + + // O(N) + visitInnerNodes(); + } + + private void computeDepth() { + Queue<Node> queue = new LinkedList<Node>(); + queue.add(tree.getRootNode()); + tree.getRootNode().depth = 0; + while (!queue.isEmpty()) { + Node node = queue.remove(); + if (!node.getEdges().isEmpty()) { + if (node != tree.getRootNode()) { // inner node = not leaf and not root + innerNodes.add(node); + } + for (Edge edge : node.getEdges()) { + Node endNode = edge.getEndNode(); + endNode.depth = node.depth + edge.getSpan() + 1; + queue.add(endNode); + } + } + } + } + + private static final Comparator<Node> DEPTH_COMPARATOR = new Comparator<Node>() { + public int compare(Node o1, Node o2) { + return o2.depth - o1.depth; + } + }; + + private void createListOfLeafs(Node node) { + node.startSize = list.size(); + if (node.getEdges().isEmpty()) { // leaf + list.add(node.depth); + } else { + for (Edge edge : node.getEdges()) { + createListOfLeafs(edge.getEndNode()); + } + node.endSize = list.size(); + } + } + + /** + * Each inner-node represents prefix of some suffixes, thus substring of text. + */ + private void visitInnerNodes() { + for (Node node : innerNodes) { + if (containsOrigin(node)) { + report(node); + } + } + } + + private boolean containsOrigin(Node node) { + for (int i = node.startSize; i < node.endSize; i++) { + int start = tree.text.length() - list.get(i); + int end = start + node.depth; + if (end < lens[0]) { + return true; + } + } + return false; + } + + private void report(Node node) { + for (int i = node.startSize; i < node.endSize; i++) { + int start = tree.text.length() - list.get(i); + int end = start + node.depth; + reporter.part(start, end, node.depth); + } + reporter.endOfGroup(); + } + + public interface Collector { + + /** + * @param start start position in generalised text + * @param end end position in generalised text + */ + void part(int start, int end, int len); + + void endOfGroup(); + + } + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Suffix.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Suffix.java new file mode 100644 index 00000000000..2f0165d7d41 --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Suffix.java @@ -0,0 +1,86 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +public final class Suffix { + + private Node originNode; + private int beginIndex; + private int endIndex; + + public Suffix(Node originNode, int beginIndex, int endIndex) { + this.originNode = originNode; + this.beginIndex = beginIndex; + this.endIndex = endIndex; + } + + public boolean isExplicit() { + return beginIndex > endIndex; + } + + public boolean isImplicit() { + return !isExplicit(); + } + + public void canonize() { + if (isImplicit()) { + Edge edge = originNode.findEdge(originNode.symbolAt(beginIndex)); + + int edgeSpan = edge.getSpan(); + while (edgeSpan <= getSpan()) { + beginIndex += edgeSpan + 1; + originNode = edge.getEndNode(); + if (beginIndex <= endIndex) { + edge = edge.getEndNode().findEdge(originNode.symbolAt(beginIndex)); + edgeSpan = edge.getSpan(); + } + } + } + } + + public int getSpan() { + return endIndex - beginIndex; + } + + public Node getOriginNode() { + return originNode; + } + + public int getBeginIndex() { + return beginIndex; + } + + public void incBeginIndex() { + beginIndex++; + } + + public void changeOriginNode() { + originNode = originNode.getSuffixNode(); + } + + public int getEndIndex() { + return endIndex; + } + + public void incEndIndex() { + endIndex++; + } + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTree.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTree.java new file mode 100644 index 00000000000..b6e21c940c7 --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTree.java @@ -0,0 +1,109 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +import com.google.common.base.Objects; + +/** + * The implementation of the algorithm for constructing suffix-tree based on + * <a href="http://illya-keeplearning.blogspot.com/search/label/suffix%20tree">Java-port</a> of + * <a href="http://marknelson.us/1996/08/01/suffix-trees/">Mark Nelson's C++ implementation of Ukkonen's algorithm</a>. + */ +public final class SuffixTree { + + final Text text; + + private final Node root; + + public static SuffixTree create(Text text) { + SuffixTree tree = new SuffixTree(text); + Suffix active = new Suffix(tree.root, 0, -1); + for (int i = 0; i < text.length(); i++) { + tree.addPrefix(active, i); + } + return tree; + } + + private SuffixTree(Text text) { + this.text = text; + root = new Node(this, null); + } + + private void addPrefix(Suffix active, int endIndex) { + Node lastParentNode = null; + Node parentNode; + + while (true) { + Edge edge; + parentNode = active.getOriginNode(); + + // Step 1 is to try and find a matching edge for the given node. + // If a matching edge exists, we are done adding edges, so we break out of this big loop. + if (active.isExplicit()) { + edge = active.getOriginNode().findEdge(symbolAt(endIndex)); + if (edge != null) { + break; + } + } else { + // implicit node, a little more complicated + edge = active.getOriginNode().findEdge(symbolAt(active.getBeginIndex())); + int span = active.getSpan(); + if (Objects.equal(symbolAt(edge.getBeginIndex() + span + 1), symbolAt(endIndex))) { + break; + } + parentNode = edge.splitEdge(active); + } + + // We didn't find a matching edge, so we create a new one, add it to the tree at the parent node position, + // and insert it into the hash table. When we create a new node, it also means we need to create + // a suffix link to the new node from the last node we visited. + Edge newEdge = new Edge(endIndex, text.length() - 1, parentNode); + newEdge.insert(); + updateSuffixNode(lastParentNode, parentNode); + lastParentNode = parentNode; + + // This final step is where we move to the next smaller suffix + if (active.getOriginNode() == root) { + active.incBeginIndex(); + } else { + active.changeOriginNode(); + } + active.canonize(); + } + updateSuffixNode(lastParentNode, parentNode); + active.incEndIndex(); // Now the endpoint is the next active point + active.canonize(); + } + + private void updateSuffixNode(Node node, Node suffixNode) { + if ((node != null) && (node != root)) { + node.setSuffixNode(suffixNode); + } + } + + public Object symbolAt(int index) { + return text.symbolAt(index); + } + + public Node getRootNode() { + return root; + } + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithm.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithm.java new file mode 100644 index 00000000000..0bd13a88cbf --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithm.java @@ -0,0 +1,111 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +import java.util.*; + +import org.sonar.duplications.block.Block; +import org.sonar.duplications.block.ByteArray; +import org.sonar.duplications.index.CloneGroup; +import org.sonar.duplications.index.CloneIndex; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; + +public final class SuffixTreeCloneDetectionAlgorithm { + + public static List<CloneGroup> detect(CloneIndex cloneIndex, Collection<Block> fileBlocks) { + if (fileBlocks.isEmpty()) { + return Collections.EMPTY_LIST; + } + GeneralisedHashText text = retrieveFromIndex(cloneIndex, fileBlocks); + if (text == null) { + return Collections.EMPTY_LIST; + } + DuplicationsCollector reporter = new DuplicationsCollector(text); + Search.perform(text, reporter); + return reporter.getResult(); + } + + private SuffixTreeCloneDetectionAlgorithm() { + } + + private static GeneralisedHashText retrieveFromIndex(CloneIndex index, Collection<Block> fileBlocks) { + String originResourceId = fileBlocks.iterator().next().getResourceId(); + + Set<ByteArray> hashes = Sets.newHashSet(); + for (Block fileBlock : fileBlocks) { + hashes.add(fileBlock.getBlockHash()); + } + + Map<String, List<Block>> collection = Maps.newHashMap(); + for (ByteArray hash : hashes) { + Collection<Block> blocks = index.getBySequenceHash(hash); + for (Block blockFromIndex : blocks) { + // Godin: skip blocks for this file if they come from index + String resourceId = blockFromIndex.getResourceId(); + if (!originResourceId.equals(resourceId)) { + List<Block> list = collection.get(resourceId); + if (list == null) { + list = Lists.newArrayList(); + collection.put(resourceId, list); + } + list.add(blockFromIndex); + } + } + } + + if (collection.isEmpty() && hashes.size() == fileBlocks.size()) { // optimization for the case when there is no duplications + return null; + } + + GeneralisedHashText text = new GeneralisedHashText(); + List<Block> sortedFileBlocks = Lists.newArrayList(fileBlocks); + Collections.sort(sortedFileBlocks, BLOCK_COMPARATOR); + text.addAll(sortedFileBlocks); + text.addTerminator(); + + for (List<Block> list : collection.values()) { + Collections.sort(list, BLOCK_COMPARATOR); + + int i = 0; + while (i < list.size()) { + int j = i + 1; + while ((j < list.size()) && (list.get(j).getIndexInFile() == list.get(j - 1).getIndexInFile() + 1)) { + j++; + } + text.addAll(list.subList(i, j)); + text.addTerminator(); + i = j; + } + } + + text.finish(); + + return text; + } + + private static final Comparator<Block> BLOCK_COMPARATOR = new Comparator<Block>() { + public int compare(Block o1, Block o2) { + return o1.getIndexInFile() - o2.getIndexInFile(); + } + }; + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Text.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Text.java new file mode 100644 index 00000000000..19b644d494a --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Text.java @@ -0,0 +1,41 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +import java.util.List; + +/** + * Represents text. + */ +public interface Text { + + /** + * @return length of the sequence of symbols represented by this object + */ + int length(); + + /** + * @return symbol at the specified index + */ + Object symbolAt(int index); + + List<Object> sequence(int fromIndex, int toIndex); + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/TextSet.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/TextSet.java new file mode 100644 index 00000000000..aa3628ed048 --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/TextSet.java @@ -0,0 +1,73 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +/** + * Simplifies construction of <a href="http://en.wikipedia.org/wiki/Generalised_suffix_tree">generalised suffix-tree</a>. + */ +public class TextSet extends AbstractText { + + int[] lens; + + public TextSet(int size) { + super(100); // FIXME + + lens = new int[size]; + } + + public TextSet(Text... text) { + this(text.length); + for (int i = 0; i < text.length; i++) { + symbols.addAll(text[i].sequence(0, text[i].length())); + symbols.add(new Terminator(i)); + lens[i] = symbols.size(); + } + } + + public static class Terminator { + + private final int stringNumber; + + public Terminator(int i) { + this.stringNumber = i; + } + + @Override + public boolean equals(Object obj) { + return (obj instanceof Terminator) && (((Terminator) obj).stringNumber == stringNumber); + } + + @Override + public int hashCode() { + return stringNumber; + } + + public int getStringNumber() { + return stringNumber; + } + + @Override + public String toString() { + return "$" + stringNumber; + } + + } + +} diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/CloneGroupMatcher.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/CloneGroupMatcher.java new file mode 100644 index 00000000000..ee4d105e490 --- /dev/null +++ b/sonar-duplications/src/test/java/org/sonar/duplications/detector/CloneGroupMatcher.java @@ -0,0 +1,81 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector; + +import org.hamcrest.Description; +import org.hamcrest.Matcher; +import org.hamcrest.Matchers; +import org.hamcrest.TypeSafeMatcher; +import org.sonar.duplications.index.CloneGroup; +import org.sonar.duplications.index.ClonePart; + +public class CloneGroupMatcher extends TypeSafeMatcher<CloneGroup> { + + public static Matcher<Iterable<CloneGroup>> hasCloneGroup(int expectedLen, ClonePart... expectedParts) { + return Matchers.hasItem(new CloneGroupMatcher(expectedLen, expectedParts)); + } + + private final int expectedLen; + private final ClonePart[] expectedParts; + + private CloneGroupMatcher(int expectedLen, ClonePart... expectedParts) { + this.expectedLen = expectedLen; + this.expectedParts = expectedParts; + } + + public boolean matchesSafely(CloneGroup cloneGroup) { + // Check length + if (expectedLen != cloneGroup.getCloneUnitLength()) { + return false; + } + // Check number of parts + if (expectedParts.length != cloneGroup.getCloneParts().size()) { + return false; + } + // Check origin + if (!expectedParts[0].equals(cloneGroup.getOriginPart())) { + return false; + } + // Check parts + for (ClonePart expectedPart : expectedParts) { + boolean matched = false; + for (ClonePart part : cloneGroup.getCloneParts()) { + if (part.equals(expectedPart)) { + matched = true; + break; + } + } + if (!matched) { + return false; + } + } + return true; + } + + public void describeTo(Description description) { + StringBuilder builder = new StringBuilder(); + for (ClonePart part : expectedParts) { + builder.append(part).append(" - "); + } + builder.append(expectedLen); + description.appendText(builder.toString()); + } + +} diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/DetectorTestCase.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/DetectorTestCase.java new file mode 100644 index 00000000000..3a461db1c5e --- /dev/null +++ b/sonar-duplications/src/test/java/org/sonar/duplications/detector/DetectorTestCase.java @@ -0,0 +1,439 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector; + +import static org.hamcrest.Matchers.hasItem; +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.sameInstance; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThat; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.verifyNoMoreInteractions; +import static org.sonar.duplications.detector.CloneGroupMatcher.hasCloneGroup; + +import java.util.*; + +import org.junit.Rule; +import org.junit.Test; +import org.sonar.duplications.block.Block; +import org.sonar.duplications.block.ByteArray; +import org.sonar.duplications.index.CloneGroup; +import org.sonar.duplications.index.CloneIndex; +import org.sonar.duplications.index.ClonePart; +import org.sonar.duplications.index.MemoryCloneIndex; +import org.sonar.duplications.junit.TestNamePrinter; + +import com.google.common.collect.Lists; + +public abstract class DetectorTestCase { + + @Rule + public TestNamePrinter testNamePrinter = new TestNamePrinter(); + + protected static int LINES_PER_BLOCK = 5; + + /** + * To simplify testing we assume that each block starts from a new line and contains {@link #LINES_PER_BLOCK} lines, + * so we can simply use index and hash. + */ + protected static Block newBlock(String resourceId, ByteArray hash, int index) { + return new Block(resourceId, hash, index, index, index + LINES_PER_BLOCK); + } + + protected static ClonePart newClonePart(String resourceId, int unitStart, int cloneUnitLength) { + return new ClonePart(resourceId, unitStart, unitStart, unitStart + cloneUnitLength + LINES_PER_BLOCK - 1); + } + + protected abstract List<CloneGroup> detect(CloneIndex index, List<Block> fileBlocks); + + /** + * Given: + * <pre> + * y: 2 3 4 5 + * z: 3 4 + * x: 1 2 3 4 5 6 + * </pre> + * Expected: + * <pre> + * x-y (2 3 4 5) + * x-y-z (3 4) + * </pre> + */ + @Test + public void exampleFromPaper() { + CloneIndex index = createIndex( + newBlocks("y", "2 3 4 5"), + newBlocks("z", "3 4")); + List<Block> fileBlocks = newBlocks("x", "1 2 3 4 5 6"); + List<CloneGroup> result = detect(index, fileBlocks); + + print(result); + assertEquals(2, result.size()); + + assertThat(result, hasCloneGroup(4, + newClonePart("x", 1, 4), + newClonePart("y", 0, 4))); + + assertThat(result, hasCloneGroup(2, + newClonePart("x", 2, 2), + newClonePart("y", 1, 2), + newClonePart("z", 0, 2))); + } + + /** + * Given: + * <pre> + * a: 2 3 4 5 + * b: 3 4 + * c: 1 2 3 4 5 6 + * </pre> + * Expected: + * <pre> + * c-a (2 3 4 5) + * c-a-b (3 4) + * </pre> + */ + @Test + public void exampleFromPaperWithModifiedResourceIds() { + CloneIndex cloneIndex = createIndex( + newBlocks("a", "2 3 4 5"), + newBlocks("b", "3 4")); + List<Block> fileBlocks = newBlocks("c", "1 2 3 4 5 6"); + List<CloneGroup> clones = detect(cloneIndex, fileBlocks); + + print(clones); + assertThat(clones.size(), is(2)); + + assertThat(clones, hasCloneGroup(4, + newClonePart("c", 1, 4), + newClonePart("a", 0, 4))); + + assertThat(clones, hasCloneGroup(2, + newClonePart("c", 2, 2), + newClonePart("a", 1, 2), + newClonePart("b", 0, 2))); + } + + /** + * Given: + * <pre> + * b: 3 4 5 6 + * c: 5 6 7 + * a: 1 2 3 4 5 6 7 8 9 + * </pre> + * Expected: + * <pre> + * a-b (3 4 5 6) + * a-b-c (5 6) + * a-c (5 6 7) + * </pre> + */ + @Test + public void example1() { + CloneIndex index = createIndex( + newBlocks("b", "3 4 5 6"), + newBlocks("c", "5 6 7")); + List<Block> fileBlocks = newBlocks("a", "1 2 3 4 5 6 7 8 9"); + List<CloneGroup> result = detect(index, fileBlocks); + + print(result); + assertThat(result.size(), is(3)); + + assertThat(result, hasCloneGroup(4, + newClonePart("a", 2, 4), + newClonePart("b", 0, 4))); + + assertThat(result, hasCloneGroup(3, + newClonePart("a", 4, 3), + newClonePart("c", 0, 3))); + + assertThat(result, hasCloneGroup(2, + newClonePart("a", 4, 2), + newClonePart("b", 2, 2), + newClonePart("c", 0, 2))); + } + + /** + * Given: + * <pre> + * b: 1 2 3 4 1 2 3 4 1 2 3 4 + * c: 1 2 3 4 + * a: 1 2 3 5 + * </pre> + * Expected: + * <pre> + * a-b-b-b-c (1 2 3) + * </pre> + */ + @Test + public void example2() { + CloneIndex index = createIndex( + newBlocks("b", "1 2 3 4 1 2 3 4 1 2 3 4"), + newBlocks("c", "1 2 3 4")); + List<Block> fileBlocks = newBlocks("a", "1 2 3 5"); + List<CloneGroup> result = detect(index, fileBlocks); + + print(result); + assertThat(result.size(), is(1)); + + assertThat(result, hasCloneGroup(3, + newClonePart("a", 0, 3), + newClonePart("b", 0, 3), + newClonePart("b", 4, 3), + newClonePart("b", 8, 3), + newClonePart("c", 0, 3))); + } + + /** + * Test for problem, which was described in original paper - same clone would be reported twice. + * Given: + * <pre> + * a: 1 2 3 1 2 4 + * </pre> + * Expected only one clone: + * <pre> + * a-a (1 2) + * </pre> + */ + @Test + public void clonesInFileItself() { + CloneIndex index = createIndex(); + List<Block> fileBlocks = newBlocks("a", "1 2 3 1 2 4"); + List<CloneGroup> result = detect(index, fileBlocks); + + print(result); + assertThat(result.size(), is(1)); + + assertThat(result, hasCloneGroup(2, + newClonePart("a", 0, 2), + newClonePart("a", 3, 2))); + } + + /** + * Given: + * <pre> + * b: 1 2 1 2 + * a: 1 2 1 + * </pre> + * Expected: + * <pre> + * a-b-b (1 2) + * a-b (1 2 1) + * </pre> + * "a-a-b-b (1)" should not be reported, because fully covered by "a-b (1 2 1)" + */ + @Test + public void covered() { + CloneIndex index = createIndex( + newBlocks("b", "1 2 1 2")); + List<Block> fileBlocks = newBlocks("a", "1 2 1"); + List<CloneGroup> result = detect(index, fileBlocks); + + print(result); + assertThat(result.size(), is(2)); + + assertThat(result, hasCloneGroup(3, + newClonePart("a", 0, 3), + newClonePart("b", 0, 3))); + + assertThat(result, hasCloneGroup(2, + newClonePart("a", 0, 2), + newClonePart("b", 0, 2), + newClonePart("b", 2, 2))); + } + + /** + * Given: + * <pre> + * b: 1 2 1 2 1 2 1 + * a: 1 2 1 2 1 2 + * </pre> + * Expected: + * <pre> + * a-b-b (1 2 1 2 1) - note that there is overlapping among parts for "b" + * a-b (1 2 1 2 1 2) + * </pre> + */ + @Test + public void problemWithNestedCloneGroups() { + CloneIndex index = createIndex( + newBlocks("b", "1 2 1 2 1 2 1")); + List<Block> fileBlocks = newBlocks("a", "1 2 1 2 1 2"); + List<CloneGroup> result = detect(index, fileBlocks); + + print(result); + assertThat(result.size(), is(2)); + + assertThat(result, hasCloneGroup(6, + newClonePart("a", 0, 6), + newClonePart("b", 0, 6))); + + assertThat(result, hasCloneGroup(5, + newClonePart("a", 0, 5), + newClonePart("b", 0, 5), + newClonePart("b", 2, 5))); + } + + /** + * Given: + * <pre> + * a: 1 2 3 + * b: 1 2 4 + * a: 1 2 5 + * </pre> + * Expected: + * <pre> + * a-b (1 2) - instead of "a-a-b", which will be the case if file from index not ignored + * </pre> + */ + @Test + public void fileAlreadyInIndex() { + CloneIndex index = createIndex( + newBlocks("a", "1 2 3"), + newBlocks("b", "1 2 4")); + // Note about blocks with hashes "3", "4" and "5": those blocks here in order to not face another problem - with EOF (see separate test) + List<Block> fileBlocks = newBlocks("a", "1 2 5"); + List<CloneGroup> result = detect(index, fileBlocks); + + print(result); + assertThat(result.size(), is(1)); + + assertThat(result, hasCloneGroup(2, + newClonePart("a", 0, 2), + newClonePart("b", 0, 2))); + } + + /** + * Given: file with repeated hashes + * Expected: only one query of index for each unique hash + */ + @Test + public void only_one_query_of_index_for_each_unique_hash() { + CloneIndex index = spy(createIndex()); + List<Block> fileBlocks = newBlocks("a", "1 2 1 2"); + detect(index, fileBlocks); + + verify(index).getBySequenceHash(new ByteArray("01")); + verify(index).getBySequenceHash(new ByteArray("02")); + verifyNoMoreInteractions(index); + } + + /** + * Given: empty list of blocks for file + * Expected: {@link Collections#EMPTY_LIST} + */ + @Test + public void shouldReturnEmptyListWhenNoBlocksForFile() { + List<CloneGroup> result = detect(null, new ArrayList<Block>()); + assertThat(result, sameInstance(Collections.EMPTY_LIST)); + } + + /** + * Given: + * <pre> + * b: 1 2 3 4 + * a: 1 2 3 + * </pre> + * Expected clone which ends at the end of file "a": + * <pre> + * a-b (1 2 3) + * </pre> + */ + @Test + public void problemWithEndOfFile() { + CloneIndex cloneIndex = createIndex( + newBlocks("b", "1 2 3 4")); + List<Block> fileBlocks = + newBlocks("a", "1 2 3"); + List<CloneGroup> clones = detect(cloneIndex, fileBlocks); + + print(clones); + assertThat(clones.size(), is(1)); + + assertThat(clones, hasCloneGroup(3, + newClonePart("a", 0, 3), + newClonePart("b", 0, 3))); + } + + /** + * Given file with two lines, containing following statements: + * <pre> + * 0: A,B,A,B + * 1: A,B,A + * </pre> + * with block size 5 each block will span both lines, and hashes will be: + * <pre> + * A,B,A,B,A=1 + * B,A,B,A,B=2 + * A,B,A,B,A=1 + * </pre> + * Expected: one clone with two parts, which contain exactly the same lines + */ + @Test + public void same_lines_but_different_indexes() { + CloneIndex cloneIndex = createIndex(); + List<Block> fileBlocks = Arrays.asList( + new Block("a", new ByteArray("1".getBytes()), 0, 0, 1), + new Block("a", new ByteArray("2".getBytes()), 1, 0, 1), + new Block("a", new ByteArray("1".getBytes()), 2, 0, 1)); + List<CloneGroup> clones = detect(cloneIndex, fileBlocks); + + print(clones); + assertThat(clones.size(), is(1)); + Iterator<CloneGroup> clonesIterator = clones.iterator(); + + CloneGroup clone = clonesIterator.next(); + assertThat(clone.getCloneUnitLength(), is(1)); + assertThat(clone.getCloneParts().size(), is(2)); + assertThat(clone.getOriginPart(), is(new ClonePart("a", 0, 0, 1))); + assertThat(clone.getCloneParts(), hasItem(new ClonePart("a", 0, 0, 1))); + assertThat(clone.getCloneParts(), hasItem(new ClonePart("a", 2, 0, 1))); + } + + protected static void print(List<CloneGroup> clones) { + for (CloneGroup clone : clones) { + System.out.println(clone); + } + System.out.println(); + } + + protected static List<Block> newBlocks(String resourceId, String hashes) { + List<Block> result = Lists.newArrayList(); + int indexInFile = 0; + for (int i = 0; i < hashes.length(); i += 2) { + Block block = newBlock(resourceId, new ByteArray("0" + hashes.charAt(i)), indexInFile); + result.add(block); + indexInFile++; + } + return result; + } + + protected static CloneIndex createIndex(List<Block>... blocks) { + CloneIndex cloneIndex = new MemoryCloneIndex(); + for (List<Block> b : blocks) { + for (Block block : b) { + cloneIndex.insert(block); + } + } + return cloneIndex; + } + +} diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/original/OriginalCloneDetectionAlgorithmTest.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/original/OriginalCloneDetectionAlgorithmTest.java index e22faf753a9..54d3f87ab18 100644 --- a/sonar-duplications/src/test/java/org/sonar/duplications/detector/original/OriginalCloneDetectionAlgorithmTest.java +++ b/sonar-duplications/src/test/java/org/sonar/duplications/detector/original/OriginalCloneDetectionAlgorithmTest.java @@ -19,494 +19,18 @@ */ package org.sonar.duplications.detector.original; -import static org.hamcrest.Matchers.hasItem; -import static org.hamcrest.Matchers.is; -import static org.hamcrest.Matchers.sameInstance; -import static org.junit.Assert.assertThat; -import static org.mockito.Mockito.spy; -import static org.mockito.Mockito.verify; -import static org.mockito.Mockito.verifyNoMoreInteractions; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.Iterator; import java.util.List; -import org.junit.Rule; -import org.junit.Test; import org.sonar.duplications.block.Block; -import org.sonar.duplications.block.ByteArray; +import org.sonar.duplications.detector.DetectorTestCase; import org.sonar.duplications.index.CloneGroup; import org.sonar.duplications.index.CloneIndex; -import org.sonar.duplications.index.ClonePart; -import org.sonar.duplications.index.MemoryCloneIndex; -import org.sonar.duplications.junit.TestNamePrinter; - -import com.google.common.collect.Lists; - -public class OriginalCloneDetectionAlgorithmTest { - - @Rule - public TestNamePrinter name = new TestNamePrinter(); - - private static int LINES_PER_BLOCK = 5; - - /** - * To simplify testing we assume that each block starts from a new line and contains {@link #LINES_PER_BLOCK} lines, - * so we can simply use index and hash. - */ - private static Block newBlock(String resourceId, ByteArray hash, int index) { - return new Block(resourceId, hash, index, index, index + LINES_PER_BLOCK); - } - - private static ClonePart newClonePart(String resourceId, int unitStart, int cloneUnitLength) { - return new ClonePart(resourceId, unitStart, unitStart, unitStart + cloneUnitLength + LINES_PER_BLOCK - 1); - } - - /** - * Given: - * <pre> - * y: 2 3 4 5 - * z: 3 4 - * x: 1 2 3 4 5 6 - * </pre> - * Expected: - * <pre> - * x-y (2 3 4 5) - * x-y-z (3 4) - * </pre> - */ - @Test - public void exampleFromPaper() { - CloneIndex cloneIndex = createIndex( - blocksForResource("y").withHashes("2", "3", "4", "5"), - blocksForResource("z").withHashes("3", "4")); - List<Block> fileBlocks = blocksForResource("x").withHashes("1", "2", "3", "4", "5", "6"); - List<CloneGroup> clones = OriginalCloneDetectionAlgorithm.detect(cloneIndex, fileBlocks); - print(clones); - assertThat(clones.size(), is(2)); - Iterator<CloneGroup> clonesIterator = clones.iterator(); - - CloneGroup clone = clonesIterator.next(); - assertThat(clone.getCloneUnitLength(), is(4)); - assertThat(clone.getCloneParts().size(), is(2)); - assertThat(clone.getOriginPart(), is(newClonePart("x", 1, 4))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("x", 1, 4))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("y", 0, 4))); - - clone = clonesIterator.next(); - assertThat(clone.getCloneUnitLength(), is(2)); - assertThat(clone.getCloneParts().size(), is(3)); - assertThat(clone.getOriginPart(), is(newClonePart("x", 2, 2))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("x", 2, 2))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("y", 1, 2))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("z", 0, 2))); - } - - /** - * Given: - * <pre> - * a: 2 3 4 5 - * b: 3 4 - * c: 1 2 3 4 5 6 - * </pre> - * Expected: - * <pre> - * c-a (2 3 4 5) - * c-a-b (3 4) - * </pre> - */ - @Test - public void exampleFromPaperWithModifiedResourceIds() { - CloneIndex cloneIndex = createIndex( - blocksForResource("a").withHashes("2", "3", "4", "5"), - blocksForResource("b").withHashes("3", "4")); - List<Block> fileBlocks = blocksForResource("c").withHashes("1", "2", "3", "4", "5", "6"); - List<CloneGroup> clones = OriginalCloneDetectionAlgorithm.detect(cloneIndex, fileBlocks); - print(clones); - assertThat(clones.size(), is(2)); - Iterator<CloneGroup> clonesIterator = clones.iterator(); - - CloneGroup clone = clonesIterator.next(); - assertThat(clone.getCloneUnitLength(), is(4)); - assertThat(clone.getCloneParts().size(), is(2)); - assertThat(clone.getOriginPart(), is(newClonePart("c", 1, 4))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("c", 1, 4))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 0, 4))); - - clone = clonesIterator.next(); - assertThat(clone.getCloneUnitLength(), is(2)); - assertThat(clone.getCloneParts().size(), is(3)); - assertThat(clone.getOriginPart(), is(newClonePart("c", 2, 2))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("c", 2, 2))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 1, 2))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 0, 2))); - } - - /** - * Given: - * <pre> - * b: 3 4 5 6 - * c: 5 6 7 - * a: 1 2 3 4 5 6 7 8 9 - * </pre> - * Expected: - * <pre> - * a-b (3 4 5 6) - * a-b-c (5 6) - * a-c (5 6 7) - * </pre> - */ - @Test - public void example1() { - CloneIndex cloneIndex = createIndex( - blocksForResource("b").withHashes("3", "4", "5", "6"), - blocksForResource("c").withHashes("5", "6", "7")); - List<Block> fileBlocks = - blocksForResource("a").withHashes("1", "2", "3", "4", "5", "6", "7", "8", "9"); - List<CloneGroup> clones = OriginalCloneDetectionAlgorithm.detect(cloneIndex, fileBlocks); - print(clones); - assertThat(clones.size(), is(3)); - Iterator<CloneGroup> clonesIterator = clones.iterator(); - - CloneGroup clone = clonesIterator.next(); - assertThat(clone.getCloneUnitLength(), is(4)); - assertThat(clone.getCloneParts().size(), is(2)); - assertThat(clone.getOriginPart(), is(newClonePart("a", 2, 4))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 2, 4))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 0, 4))); - - clone = clonesIterator.next(); - assertThat(clone.getCloneUnitLength(), is(2)); - assertThat(clone.getCloneParts().size(), is(3)); - assertThat(clone.getOriginPart(), is(newClonePart("a", 4, 2))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 4, 2))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 2, 2))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("c", 0, 2))); - - clone = clonesIterator.next(); - assertThat(clone.getCloneUnitLength(), is(3)); - assertThat(clone.getCloneParts().size(), is(2)); - assertThat(clone.getOriginPart(), is(newClonePart("a", 4, 3))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 4, 3))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("c", 0, 3))); - } - - /** - * Given: - * <pre> - * b: 1 2 3 4 1 2 3 4 1 2 3 4 - * c: 1 2 3 4 - * a: 1 2 3 4 5 - * </pre> - * Expected: - * <pre> - * a-b-b-b-c (1 2 3 4) - * </pre> - */ - @Test - public void example2() { - CloneIndex cloneIndex = createIndex( - blocksForResource("b").withHashes("1", "2", "3", "4", "1", "2", "3", "4", "1", "2", "3", "4"), - blocksForResource("c").withHashes("1", "2", "3", "4")); - List<Block> fileBlocks = - blocksForResource("a").withHashes("1", "2", "3", "5"); - List<CloneGroup> clones = OriginalCloneDetectionAlgorithm.detect(cloneIndex, fileBlocks); - print(clones); - assertThat(clones.size(), is(1)); - Iterator<CloneGroup> clonesIterator = clones.iterator(); - - CloneGroup clone = clonesIterator.next(); - assertThat(clone.getCloneUnitLength(), is(3)); - assertThat(clone.getCloneParts().size(), is(5)); - assertThat(clone.getOriginPart(), is(newClonePart("a", 0, 3))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 0, 3))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 0, 3))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 4, 3))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 8, 3))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("c", 0, 3))); - } - - /** - * Given: - * <pre> - * b: 1 2 3 4 - * a: 1 2 3 - * </pre> - * Expected clone which ends at the end of file "a": - * <pre> - * a-b (1 2 3) - * </pre> - */ - @Test - public void problemWithEndOfFile() { - CloneIndex cloneIndex = createIndex( - blocksForResource("b").withHashes("1", "2", "3", "4")); - List<Block> fileBlocks = - blocksForResource("a").withHashes("1", "2", "3"); - List<CloneGroup> clones = OriginalCloneDetectionAlgorithm.detect(cloneIndex, fileBlocks); - print(clones); - assertThat(clones.size(), is(1)); - Iterator<CloneGroup> clonesIterator = clones.iterator(); - - CloneGroup clone = clonesIterator.next(); - assertThat(clone.getCloneUnitLength(), is(3)); - assertThat(clone.getCloneParts().size(), is(2)); - assertThat(clone.getOriginPart(), is(newClonePart("a", 0, 3))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 0, 3))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 0, 3))); - } - - /** - * Test for problem, which was described in original paper - same clone would be reported twice. - * Given: - * <pre> - * a: 1 2 3 1 2 4 - * </pre> - * Expected only one clone: - * <pre> - * a-a (1 2) - * </pre> - */ - @Test - public void clonesInFileItself() { - CloneIndex cloneIndex = createIndex(); - List<Block> fileBlocks = - blocksForResource("a").withHashes("1", "2", "3", "1", "2", "4"); - List<CloneGroup> clones = OriginalCloneDetectionAlgorithm.detect(cloneIndex, fileBlocks); - print(clones); - - assertThat(clones.size(), is(1)); - Iterator<CloneGroup> clonesIterator = clones.iterator(); - - CloneGroup clone = clonesIterator.next(); - assertThat(clone.getCloneUnitLength(), is(2)); - assertThat(clone.getCloneParts().size(), is(2)); - assertThat(clone.getOriginPart(), is(newClonePart("a", 0, 2))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 0, 2))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 3, 2))); - } - - /** - * Given: - * <pre> - * b: 1 2 1 2 - * a: 1 2 1 - * </pre> - * Expected: - * <pre> - * a-b-b (1 2) - * a-b (1 2 1) - * </pre> - * "a-a-b-b (1)" should not be reported, because fully covered by "a-b (1 2 1)" - */ - @Test - public void covered() { - CloneIndex cloneIndex = createIndex( - blocksForResource("b").withHashes("1", "2", "1", "2")); - List<Block> fileBlocks = - blocksForResource("a").withHashes("1", "2", "1"); - List<CloneGroup> clones = OriginalCloneDetectionAlgorithm.detect(cloneIndex, fileBlocks); - print(clones); - - assertThat(clones.size(), is(2)); - Iterator<CloneGroup> clonesIterator = clones.iterator(); - - CloneGroup clone = clonesIterator.next(); - assertThat(clone.getCloneUnitLength(), is(2)); - assertThat(clone.getOriginPart(), is(newClonePart("a", 0, 2))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 0, 2))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 0, 2))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 2, 2))); - - clone = clonesIterator.next(); - assertThat(clone.getCloneUnitLength(), is(3)); - assertThat(clone.getOriginPart(), is(newClonePart("a", 0, 3))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 0, 3))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 0, 3))); - } - - /** - * Given: - * <pre> - * b: 1 2 1 2 1 2 1 - * a: 1 2 1 2 1 2 - * </pre> - * Expected: - * <pre> - * a-b-b (1 2 1 2 1) - note that there is overlapping among parts for "b" - * a-b (1 2 1 2 1 2) - * </pre> - */ - @Test - public void problemWithNestedCloneGroups() { - CloneIndex cloneIndex = createIndex( - blocksForResource("b").withHashes("1", "2", "1", "2", "1", "2", "1")); - List<Block> fileBlocks = - blocksForResource("a").withHashes("1", "2", "1", "2", "1", "2"); - List<CloneGroup> clones = OriginalCloneDetectionAlgorithm.detect(cloneIndex, fileBlocks); - print(clones); - - assertThat(clones.size(), is(2)); - Iterator<CloneGroup> clonesIterator = clones.iterator(); - - CloneGroup clone = clonesIterator.next(); - assertThat(clone.getCloneUnitLength(), is(5)); - assertThat(clone.getCloneParts().size(), is(3)); - assertThat(clone.getOriginPart(), is(newClonePart("a", 0, 5))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 0, 5))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 0, 5))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 2, 5))); - - clone = clonesIterator.next(); - assertThat(clone.getCloneUnitLength(), is(6)); - assertThat(clone.getCloneParts().size(), is(2)); - assertThat(clone.getOriginPart(), is(newClonePart("a", 0, 6))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 0, 6))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 0, 6))); - } - - /** - * Given: - * <pre> - * a: 1 2 3 - * b: 1 2 4 - * a: 1 2 5 - * </pre> - * Expected: - * <pre> - * a-b (1 2) - instead of "a-a-b", which will be the case if file from index not ignored - * </pre> - */ - @Test - public void fileAlreadyInIndex() { - CloneIndex cloneIndex = createIndex( - blocksForResource("a").withHashes("1", "2", "3"), - blocksForResource("b").withHashes("1", "2", "4")); - // Note about blocks with hashes "3", "4" and "5": those blocks here in order to not face another problem - with EOF (see separate test) - List<Block> fileBlocks = - blocksForResource("a").withHashes("1", "2", "5"); - List<CloneGroup> clones = OriginalCloneDetectionAlgorithm.detect(cloneIndex, fileBlocks); - print(clones); - - assertThat(clones.size(), is(1)); - Iterator<CloneGroup> clonesIterator = clones.iterator(); - - CloneGroup clone = clonesIterator.next(); - assertThat(clone.getCloneUnitLength(), is(2)); - assertThat(clone.getCloneParts().size(), is(2)); - assertThat(clone.getOriginPart(), is(newClonePart("a", 0, 2))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("a", 0, 2))); - assertThat(clone.getCloneParts(), hasItem(newClonePart("b", 0, 2))); - } - - /** - * Given: file with repeated hashes - * Expected: only one query of index for each unique hash - */ - @Test - public void only_one_query_of_index_for_each_unique_hash() { - CloneIndex index = spy(createIndex()); - List<Block> fileBlocks = - blocksForResource("a").withHashes("1", "2", "1", "2"); - OriginalCloneDetectionAlgorithm.detect(index, fileBlocks); - - verify(index).getBySequenceHash(new ByteArray("1".getBytes())); - verify(index).getBySequenceHash(new ByteArray("2".getBytes())); - verifyNoMoreInteractions(index); - } - - /** - * Given file with two lines, containing following statements: - * <pre> - * 0: A,B,A,B - * 1: A,B,A - * </pre> - * with block size 5 each block will span both lines, and hashes will be: - * <pre> - * A,B,A,B,A=1 - * B,A,B,A,B=2 - * A,B,A,B,A=1 - * </pre> - * Expected: one clone with two parts, which contain exactly the same lines - */ - @Test - public void same_lines_but_different_indexes() { - CloneIndex cloneIndex = createIndex(); - List<Block> fileBlocks = Arrays.asList( - new Block("a", new ByteArray("1".getBytes()), 0, 0, 1), - new Block("a", new ByteArray("2".getBytes()), 1, 0, 1), - new Block("a", new ByteArray("1".getBytes()), 2, 0, 1)); - List<CloneGroup> clones = OriginalCloneDetectionAlgorithm.detect(cloneIndex, fileBlocks); - print(clones); - - assertThat(clones.size(), is(1)); - Iterator<CloneGroup> clonesIterator = clones.iterator(); - - CloneGroup clone = clonesIterator.next(); - assertThat(clone.getCloneUnitLength(), is(1)); - assertThat(clone.getCloneParts().size(), is(2)); - assertThat(clone.getOriginPart(), is(new ClonePart("a", 0, 0, 1))); - assertThat(clone.getCloneParts(), hasItem(new ClonePart("a", 0, 0, 1))); - assertThat(clone.getCloneParts(), hasItem(new ClonePart("a", 2, 0, 1))); - } - - /** - * Given: empty list of blocks for file - * Expected: {@link Collections#EMPTY_LIST} - */ - @Test - public void shouldReturnEmptyListWhenNoBlocksForFile() { - List<CloneGroup> result = OriginalCloneDetectionAlgorithm.detect(null, new ArrayList<Block>()); - assertThat(result, sameInstance(Collections.EMPTY_LIST)); - } - - private void print(List<CloneGroup> clones) { - for (CloneGroup clone : clones) { - System.out.println(clone); - } - System.out.println(); - } - - private static CloneIndex createIndex(List<Block>... blocks) { - CloneIndex cloneIndex = new MemoryCloneIndex(); - for (List<Block> b : blocks) { - for (Block block : b) { - cloneIndex.insert(block); - } - } - return cloneIndex; - } - - private static BlocksBuilder blocksForResource(String resourceId) { - return new BlocksBuilder(resourceId); - } - - private static class BlocksBuilder { - String resourceId; - - public BlocksBuilder(String resourceId) { - this.resourceId = resourceId; - } - List<Block> withHashes(String... hashes) { - ByteArray[] arrays = new ByteArray[hashes.length]; - for (int i = 0; i < hashes.length; i++) { - arrays[i] = new ByteArray(hashes[i].getBytes()); - } - return withHashes(arrays); - } +public class OriginalCloneDetectionAlgorithmTest extends DetectorTestCase { - List<Block> withHashes(ByteArray... hashes) { - List<Block> result = Lists.newArrayList(); - int index = 0; - for (ByteArray hash : hashes) { - result.add(newBlock(resourceId, hash, index)); - index++; - } - return result; - } + @Override + protected List<CloneGroup> detect(CloneIndex index, List<Block> fileBlocks) { + return OriginalCloneDetectionAlgorithm.detect(index, fileBlocks); } } diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/PrintCollector.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/PrintCollector.java new file mode 100644 index 00000000000..ab4b0b9cd8f --- /dev/null +++ b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/PrintCollector.java @@ -0,0 +1,47 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +import org.sonar.duplications.detector.suffixtree.Search; +import org.sonar.duplications.detector.suffixtree.TextSet; + +public final class PrintCollector implements Search.Collector { + + private final TextSet text; + private int groups; + + public PrintCollector(TextSet text) { + this.text = text; + } + + public void part(int start, int end, int len) { + System.out.println(start + " " + end + " : " + text.sequence(start, end)); + } + + public void endOfGroup() { + groups++; + System.out.println(); + } + + public int getGroups() { + return groups; + } + +} diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTree.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTree.java new file mode 100644 index 00000000000..80daa78225b --- /dev/null +++ b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTree.java @@ -0,0 +1,96 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +import org.sonar.duplications.detector.suffixtree.Edge; +import org.sonar.duplications.detector.suffixtree.Node; +import org.sonar.duplications.detector.suffixtree.SuffixTree; +import org.sonar.duplications.detector.suffixtree.Text; + +import com.google.common.base.Objects; + +public class StringSuffixTree { + + private final SuffixTree suffixTree; + + public static StringSuffixTree create(String text) { + return new StringSuffixTree(text); + } + + private StringSuffixTree(String text) { + suffixTree = SuffixTree.create(new StringText(text)); + } + + public int indexOf(String str) { + return indexOf(suffixTree, new StringText(str)); + } + + public boolean contains(String str) { + return contains(suffixTree, new StringText(str)); + } + + public SuffixTree getSuffixTree() { + return suffixTree; + } + + public static boolean contains(SuffixTree tree, Text str) { + return indexOf(tree, str) >= 0; + } + + public static int indexOf(SuffixTree tree, Text str) { + if (str.length() == 0) { + return -1; + } + + int index = -1; + Node node = tree.getRootNode(); + + int i = 0; + while (i < str.length()) { + if (node == null) { + return -1; + } + if (i == tree.text.length()) { + return -1; + } + + Edge edge = node.findEdge(str.symbolAt(i)); + if (edge == null) { + return -1; + } + + index = edge.getBeginIndex() - i; + i++; + + for (int j = edge.getBeginIndex() + 1; j <= edge.getEndIndex(); j++) { + if (i == str.length()) { + break; + } + if (!Objects.equal(tree.symbolAt(j), str.symbolAt(i))) { + return -1; + } + i++; + } + node = edge.getEndNode(); + } + return index; + } + +} diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTreeTest.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTreeTest.java new file mode 100644 index 00000000000..7cd796c755f --- /dev/null +++ b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTreeTest.java @@ -0,0 +1,106 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import org.junit.Test; + +public class StringSuffixTreeTest { + + @Test + public void testMississippi() { + StringSuffixTree st = StringSuffixTree.create("mississippi"); + + assertTrue(st.contains("miss")); + assertTrue(st.contains("missis")); + assertTrue(st.contains("pi")); + } + + @Test + public void testBanana() { + StringSuffixTree st = StringSuffixTree.create("banana"); + + assertTrue(st.contains("ana")); + assertTrue(st.contains("an")); + assertTrue(st.contains("na")); + } + + @Test + public void testBook() { + StringSuffixTree st = StringSuffixTree.create("book"); + + assertTrue(st.contains("book")); + assertTrue(st.contains("oo")); + assertTrue(st.contains("ok")); + assertFalse(st.contains("okk")); + assertFalse(st.contains("bookk")); + assertFalse(st.contains("bok")); + + assertEquals(0, st.indexOf("book")); + assertEquals(1, st.indexOf("oo")); + assertEquals(2, st.indexOf("ok")); + } + + @Test + public void testBookke() { + StringSuffixTree st = StringSuffixTree.create("bookke"); + + assertTrue(st.contains("bookk")); + + assertEquals(0, st.indexOf("book")); + assertEquals(1, st.indexOf("oo")); + assertEquals(2, st.indexOf("ok")); + } + + @Test + public void testCacao() { + StringSuffixTree st = StringSuffixTree.create("cacao"); + + assertTrue(st.contains("aca")); + + assertEquals(3, st.indexOf("ao")); + assertEquals(0, st.indexOf("ca")); + assertEquals(2, st.indexOf("cao")); + } + + @Test + public void testGoogol() { + StringSuffixTree st = StringSuffixTree.create("googol"); + + assertTrue(st.contains("oo")); + + assertEquals(0, st.indexOf("go")); + assertEquals(3, st.indexOf("gol")); + assertEquals(1, st.indexOf("oo")); + } + + @Test + public void testAbababc() { + StringSuffixTree st = StringSuffixTree.create("abababc"); + + assertTrue(st.contains("aba")); + + assertEquals(0, st.indexOf("aba")); + assertEquals(4, st.indexOf("abc")); + } +} diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringText.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringText.java new file mode 100644 index 00000000000..7828f03494e --- /dev/null +++ b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringText.java @@ -0,0 +1,37 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +import org.sonar.duplications.detector.suffixtree.AbstractText; +import org.sonar.duplications.detector.suffixtree.Text; + +/** + * Implementation of {@link Text} based on {@link String}. + */ +public class StringText extends AbstractText { + + public StringText(String text) { + super(text.length()); + for (int i = 0; i < text.length(); i++) { + symbols.add(Character.valueOf(text.charAt(i))); + } + } + +} diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithmTest.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithmTest.java new file mode 100644 index 00000000000..572811ef3df --- /dev/null +++ b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithmTest.java @@ -0,0 +1,120 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +import static org.hamcrest.Matchers.sameInstance; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThat; +import static org.sonar.duplications.detector.CloneGroupMatcher.hasCloneGroup; + +import java.util.Collections; +import java.util.List; + +import org.junit.Test; +import org.sonar.duplications.block.Block; +import org.sonar.duplications.detector.DetectorTestCase; +import org.sonar.duplications.index.CloneGroup; +import org.sonar.duplications.index.CloneIndex; + +public class SuffixTreeCloneDetectionAlgorithmTest extends DetectorTestCase { + + /** + * Given: file without duplications + * Expected: {@link Collections#EMPTY_LIST} (no need to construct suffix-tree) + */ + @Test + public void noDuplications() { + CloneIndex index = createIndex(); + List<Block> fileBlocks = newBlocks("a", "1 2 3"); + List<CloneGroup> result = detect(index, fileBlocks); + assertThat(result, sameInstance(Collections.EMPTY_LIST)); + } + + /** + * Given: + * <pre> + * x: a 2 b 2 c 2 2 2 + * </pre> + * Expected: + * <pre> + * x-x (2 2) + * x-x-x-x-x (2) + * <pre> + */ + @Test + public void myTest() { + CloneIndex index = createIndex(); + List<Block> fileBlocks = newBlocks("x", "a 2 b 2 c 2 2 2"); + List<CloneGroup> result = detect(index, fileBlocks); + + print(result); + assertEquals(2, result.size()); + + assertThat(result, hasCloneGroup(2, + newClonePart("x", 5, 2), + newClonePart("x", 6, 2))); + + assertThat(result, hasCloneGroup(1, + newClonePart("x", 1, 1), + newClonePart("x", 3, 1), + newClonePart("x", 5, 1), + newClonePart("x", 6, 1), + newClonePart("x", 7, 1))); + } + + /** + * Given: + * <pre> + * x: a 2 3 b 2 3 c 2 3 d 2 3 2 3 2 3 + * </pre> + * Expected: + * <pre> + * x-x (2 3 2 3) + * x-x-x-x-x-x (2 3) + * <pre> + */ + @Test + public void myTest2() { + CloneIndex index = createIndex(); + List<Block> fileBlocks = newBlocks("x", "a 2 3 b 2 3 c 2 3 d 2 3 2 3 2 3"); + List<CloneGroup> result = detect(index, fileBlocks); + + print(result); + assertEquals(2, result.size()); + + assertThat(result, hasCloneGroup(4, + newClonePart("x", 10, 4), + newClonePart("x", 12, 4))); + + assertThat(result, hasCloneGroup(2, + newClonePart("x", 1, 2), + newClonePart("x", 4, 2), + newClonePart("x", 7, 2), + newClonePart("x", 10, 2), + newClonePart("x", 12, 2), + newClonePart("x", 14, 2))); + } + + @Override + protected List<CloneGroup> detect(CloneIndex index, List<Block> fileBlocks) { + return SuffixTreeCloneDetectionAlgorithm.detect(index, fileBlocks); + } + +} |