diff options
author | Evgeny Mandrikov <mandrikov@gmail.com> | 2011-12-07 02:37:47 +0400 |
---|---|---|
committer | Evgeny Mandrikov <mandrikov@gmail.com> | 2011-12-07 03:31:04 +0400 |
commit | f0d96c1053f6345e82351522c5bd8ce5633008bd (patch) | |
tree | b984e5f755b82ee21073b7ab967d314b22a34fe4 /sonar-duplications/src/main/java/org/sonar | |
parent | 9ae42f76ea1c21b43e3ea6ec044fe45276be9875 (diff) | |
download | sonarqube-f0d96c1053f6345e82351522c5bd8ce5633008bd.tar.gz sonarqube-f0d96c1053f6345e82351522c5bd8ce5633008bd.zip |
SONAR-3060 Add new CPD algorithm based on suffix tree
Diffstat (limited to 'sonar-duplications/src/main/java/org/sonar')
11 files changed, 1046 insertions, 0 deletions
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/AbstractText.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/AbstractText.java new file mode 100644 index 00000000000..5a3ffa0c14c --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/AbstractText.java @@ -0,0 +1,45 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +import java.util.ArrayList; +import java.util.List; + +public abstract class AbstractText implements Text { + + protected final List<Object> symbols; + + public AbstractText(int size) { + this.symbols = new ArrayList<Object>(size); + } + + public int length() { + return symbols.size(); + } + + public Object symbolAt(int index) { + return symbols.get(index); + } + + public List<Object> sequence(int fromIndex, int toIndex) { + return symbols.subList(fromIndex, toIndex); + } + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java new file mode 100644 index 00000000000..745fb90a3b3 --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java @@ -0,0 +1,200 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +import java.util.Collections; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; + +import org.sonar.duplications.block.Block; +import org.sonar.duplications.index.CloneGroup; +import org.sonar.duplications.index.ClonePart; +import org.sonar.duplications.utils.FastStringComparator; +import org.sonar.duplications.utils.SortedListsUtils; + +import com.google.common.collect.Lists; + +/** + * Implementation of {@link Search.Collector}, which constructs {@link CloneGroup}s. + */ +public class DuplicationsCollector implements Search.Collector { + + private final GeneralisedHashText text; + private final String originResourceId; + + /** + * Note that LinkedList should provide better performance here, because of use of operation remove. + * + * @see #filter(CloneGroup) + */ + private final List<CloneGroup> filtered = Lists.newLinkedList(); + + private int length; + private ClonePart origin; + private final List<ClonePart> parts = Lists.newArrayList(); + + public DuplicationsCollector(GeneralisedHashText text) { + this.text = text; + this.originResourceId = text.getBlock(0).getResourceId(); + } + + /** + * @return current result + */ + public List<CloneGroup> getResult() { + return filtered; + } + + public void part(int start, int end, int len) { + length = len; + Block firstBlock = text.getBlock(start); + Block lastBlock = text.getBlock(end - 1); + + ClonePart part = new ClonePart( + firstBlock.getResourceId(), + firstBlock.getIndexInFile(), + firstBlock.getFirstLineNumber(), + lastBlock.getLastLineNumber()); + + // TODO Godin: maybe use FastStringComparator here ? + if (originResourceId.equals(part.getResourceId())) { // part from origin + if (origin == null) { + origin = part; + } else if (part.getUnitStart() < origin.getUnitStart()) { + origin = part; + } + } + + parts.add(part); + } + + public void endOfGroup() { + Collections.sort(parts, CLONEPART_COMPARATOR); + CloneGroup group = new CloneGroup(length, origin, parts); + filter(group); + + parts.clear(); + origin = null; + } + + private void filter(CloneGroup current) { + Iterator<CloneGroup> i = filtered.iterator(); + while (i.hasNext()) { + CloneGroup earlier = i.next(); + // Note that following two conditions cannot be true together - proof by contradiction: + // let C be the current clone and A and B were found earlier + // then since relation is transitive - (A in C) and (C in B) => (A in B) + // so A should be filtered earlier + if (containsIn(current, earlier)) { + // current clone fully covered by clone, which was found earlier + return; + } + // TODO Godin: must prove that this is unused + // if (containsIn(earlier, current)) { + // // current clone fully covers clone, which was found earlier + // i.remove(); + // } + } + filtered.add(current); + } + + private static final Comparator<ClonePart> CLONEPART_COMPARATOR = new Comparator<ClonePart>() { + public int compare(ClonePart o1, ClonePart o2) { + int c = RESOURCE_ID_COMPARATOR.compare(o1, o2); + if (c == 0) { + return o1.getUnitStart() - o2.getUnitStart(); + } + return c; + } + }; + + private static final Comparator<ClonePart> RESOURCE_ID_COMPARATOR = new Comparator<ClonePart>() { + public int compare(ClonePart o1, ClonePart o2) { + return FastStringComparator.INSTANCE.compare(o1.getResourceId(), o2.getResourceId()); + } + }; + + /** + * Checks that second clone contains first one. + * <p> + * Clone A is contained in another clone B, if every part pA from A has part pB in B, + * which satisfy the conditions: + * <pre> + * (pA.resourceId == pB.resourceId) and (pB.unitStart <= pA.unitStart) and (pA.unitEnd <= pB.unitEnd) + * </pre> + * And all resourcesId from B exactly the same as all resourceId from A, which means that also every part pB from B has part pA in A, + * which satisfy the condition: + * <pre> + * pB.resourceId == pA.resourceId + * </pre> + * So this relation is: + * <ul> + * <li>reflexive - A in A</li> + * <li>transitive - (A in B) and (B in C) => (A in C)</li> + * <li>antisymmetric - (A in B) and (B in A) <=> (A = B)</li> + * </ul> + * </p> + * <p> + * This method uses the fact that all parts already sorted by resourceId and unitStart (see {@link #CLONEPART_COMPARATOR}), + * so running time - O(|A|+|B|). + * </p> + */ + private static boolean containsIn(CloneGroup first, CloneGroup second) { + // TODO Godin: must prove that this is unused + // if (!first.getOriginPart().getResourceId().equals(second.getOriginPart().getResourceId())) { + // return false; + // } + if (first.getCloneUnitLength() > second.getCloneUnitLength()) { + return false; + } + List<ClonePart> firstParts = first.getCloneParts(); + List<ClonePart> secondParts = second.getCloneParts(); + return SortedListsUtils.contains(secondParts, firstParts, new ContainsInComparator(first.getCloneUnitLength(), second.getCloneUnitLength())) + && SortedListsUtils.contains(firstParts, secondParts, RESOURCE_ID_COMPARATOR); + } + + private static class ContainsInComparator implements Comparator<ClonePart> { + private final int l1, l2; + + public ContainsInComparator(int l1, int l2) { + this.l1 = l1; + this.l2 = l2; + } + + public int compare(ClonePart o1, ClonePart o2) { + int c = RESOURCE_ID_COMPARATOR.compare(o1, o2); + if (c == 0) { + if (o2.getUnitStart() <= o1.getUnitStart()) { + if (o1.getUnitStart() + l1 <= o2.getUnitStart() + l2) { + return 0; // match found - stop search + } else { + return 1; // continue search + } + } else { + return -1; // o1 < o2 by unitStart - stop search + } + } else { + return c; + } + } + } + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Edge.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Edge.java new file mode 100644 index 00000000000..968fbd9e226 --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Edge.java @@ -0,0 +1,79 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +public final class Edge { + + private int beginIndex; // can't be changed + private int endIndex; + private Node startNode; + private Node endNode; // can't be changed, could be used as edge id + + // each time edge is created, a new end node is created + public Edge(int beginIndex, int endIndex, Node startNode) { + this.beginIndex = beginIndex; + this.endIndex = endIndex; + this.startNode = startNode; + this.endNode = new Node(startNode, null); + } + + public Node splitEdge(Suffix suffix) { + remove(); + Edge newEdge = new Edge(beginIndex, beginIndex + suffix.getSpan(), suffix.getOriginNode()); + newEdge.insert(); + newEdge.endNode.setSuffixNode(suffix.getOriginNode()); + beginIndex += suffix.getSpan() + 1; + startNode = newEdge.getEndNode(); + insert(); + return newEdge.getEndNode(); + } + + public void insert() { + startNode.addEdge(beginIndex, this); + } + + public void remove() { + startNode.removeEdge(beginIndex); + } + + /** + * @return length of this edge in symbols + */ + public int getSpan() { + return endIndex - beginIndex; + } + + public int getBeginIndex() { + return beginIndex; + } + + public int getEndIndex() { + return endIndex; + } + + public Node getStartNode() { + return startNode; + } + + public Node getEndNode() { + return endNode; + } + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/GeneralisedHashText.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/GeneralisedHashText.java new file mode 100644 index 00000000000..202cb85084c --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/GeneralisedHashText.java @@ -0,0 +1,77 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +import java.util.List; + +import org.sonar.duplications.block.Block; + +import com.google.common.collect.Lists; + +public class GeneralisedHashText extends TextSet { + + public GeneralisedHashText(List<Block>... blocks) { + super(blocks.length); + + for (int i = 0; i < blocks.length; i++) { + addAll(blocks[i]); + addTerminator(); + } + finish(); + } + + private int count; + private List<Integer> sizes = Lists.newArrayList(); + + public void addBlock(Block block) { + symbols.add(block); + } + + public void addAll(List<Block> list) { + symbols.addAll(list); + } + + public void addTerminator() { + symbols.add(new Terminator(count)); + sizes.add(symbols.size()); + count++; + } + + public void finish() { + super.lens = new int[sizes.size()]; + for (int i = 0; i < sizes.size(); i++) { + super.lens[i] = sizes.get(i); + } + } + + @Override + public Object symbolAt(int index) { + Object obj = super.symbolAt(index); + if (obj instanceof Block) { + return ((Block) obj).getBlockHash(); + } + return obj; + } + + public Block getBlock(int index) { + return (Block) super.symbolAt(index); + } + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Node.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Node.java new file mode 100644 index 00000000000..8fa39ecc832 --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Node.java @@ -0,0 +1,88 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +import java.util.Map; +import java.util.HashMap; +import java.util.Collection; + +public final class Node { + + private final SuffixTree suffixTree; + private final Map<Object, Edge> edges; + + /** + * Node represents string s[i],s[i+1],...,s[j], + * suffix-link is a link to node, which represents string s[i+1],...,s[j]. + */ + private Node suffixNode; + + /** + * Number of symbols from the root to this node. + * <p> + * Note that this is not equal to number of nodes from root to this node, + * because in a compact suffix-tree edge can span multiple symbols - see {@link Edge#getSpan()}. + * </p><p> + * Depth of {@link #suffixNode} is always equal to this depth minus one. + * </p> + */ + int depth; + + int startSize, endSize; + + public Node(Node node, Node suffixNode) { + this(node.suffixTree, suffixNode); + } + + public Node(SuffixTree suffixTree, Node suffixNode) { + this.suffixTree = suffixTree; + this.suffixNode = suffixNode; + edges = new HashMap<Object, Edge>(); + } + + public Object symbolAt(int index) { + return suffixTree.symbolAt(index); + } + + public void addEdge(int charIndex, Edge edge) { + edges.put(symbolAt(charIndex), edge); + } + + public void removeEdge(int charIndex) { + edges.remove(symbolAt(charIndex)); + } + + public Edge findEdge(Object ch) { + return edges.get(ch); + } + + public Node getSuffixNode() { + return suffixNode; + } + + public void setSuffixNode(Node suffixNode) { + this.suffixNode = suffixNode; + } + + public Collection<Edge> getEdges() { + return edges.values(); + } + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Search.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Search.java new file mode 100644 index 00000000000..f13410e90d4 --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Search.java @@ -0,0 +1,137 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +import java.util.*; + +public final class Search { + + private final SuffixTree tree; + private final int[] lens; + private final Collector reporter; + + private final List<Integer> list = new ArrayList<Integer>(); + private final List<Node> innerNodes = new ArrayList<Node>(); + + public static void perform(TextSet text, Collector reporter) { + new Search(SuffixTree.create(text), text.lens, reporter).compute(); + } + + private Search(SuffixTree tree, int[] lens, Collector reporter) { + this.tree = tree; + this.lens = lens; + this.reporter = reporter; + } + + private void compute() { + // O(N) + computeDepth(); + + // O(N * log(N)) + Collections.sort(innerNodes, DEPTH_COMPARATOR); + + // O(N), recursive + createListOfLeafs(tree.getRootNode()); + + // O(N) + visitInnerNodes(); + } + + private void computeDepth() { + Queue<Node> queue = new LinkedList<Node>(); + queue.add(tree.getRootNode()); + tree.getRootNode().depth = 0; + while (!queue.isEmpty()) { + Node node = queue.remove(); + if (!node.getEdges().isEmpty()) { + if (node != tree.getRootNode()) { // inner node = not leaf and not root + innerNodes.add(node); + } + for (Edge edge : node.getEdges()) { + Node endNode = edge.getEndNode(); + endNode.depth = node.depth + edge.getSpan() + 1; + queue.add(endNode); + } + } + } + } + + private static final Comparator<Node> DEPTH_COMPARATOR = new Comparator<Node>() { + public int compare(Node o1, Node o2) { + return o2.depth - o1.depth; + } + }; + + private void createListOfLeafs(Node node) { + node.startSize = list.size(); + if (node.getEdges().isEmpty()) { // leaf + list.add(node.depth); + } else { + for (Edge edge : node.getEdges()) { + createListOfLeafs(edge.getEndNode()); + } + node.endSize = list.size(); + } + } + + /** + * Each inner-node represents prefix of some suffixes, thus substring of text. + */ + private void visitInnerNodes() { + for (Node node : innerNodes) { + if (containsOrigin(node)) { + report(node); + } + } + } + + private boolean containsOrigin(Node node) { + for (int i = node.startSize; i < node.endSize; i++) { + int start = tree.text.length() - list.get(i); + int end = start + node.depth; + if (end < lens[0]) { + return true; + } + } + return false; + } + + private void report(Node node) { + for (int i = node.startSize; i < node.endSize; i++) { + int start = tree.text.length() - list.get(i); + int end = start + node.depth; + reporter.part(start, end, node.depth); + } + reporter.endOfGroup(); + } + + public interface Collector { + + /** + * @param start start position in generalised text + * @param end end position in generalised text + */ + void part(int start, int end, int len); + + void endOfGroup(); + + } + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Suffix.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Suffix.java new file mode 100644 index 00000000000..2f0165d7d41 --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Suffix.java @@ -0,0 +1,86 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +public final class Suffix { + + private Node originNode; + private int beginIndex; + private int endIndex; + + public Suffix(Node originNode, int beginIndex, int endIndex) { + this.originNode = originNode; + this.beginIndex = beginIndex; + this.endIndex = endIndex; + } + + public boolean isExplicit() { + return beginIndex > endIndex; + } + + public boolean isImplicit() { + return !isExplicit(); + } + + public void canonize() { + if (isImplicit()) { + Edge edge = originNode.findEdge(originNode.symbolAt(beginIndex)); + + int edgeSpan = edge.getSpan(); + while (edgeSpan <= getSpan()) { + beginIndex += edgeSpan + 1; + originNode = edge.getEndNode(); + if (beginIndex <= endIndex) { + edge = edge.getEndNode().findEdge(originNode.symbolAt(beginIndex)); + edgeSpan = edge.getSpan(); + } + } + } + } + + public int getSpan() { + return endIndex - beginIndex; + } + + public Node getOriginNode() { + return originNode; + } + + public int getBeginIndex() { + return beginIndex; + } + + public void incBeginIndex() { + beginIndex++; + } + + public void changeOriginNode() { + originNode = originNode.getSuffixNode(); + } + + public int getEndIndex() { + return endIndex; + } + + public void incEndIndex() { + endIndex++; + } + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTree.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTree.java new file mode 100644 index 00000000000..b6e21c940c7 --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTree.java @@ -0,0 +1,109 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +import com.google.common.base.Objects; + +/** + * The implementation of the algorithm for constructing suffix-tree based on + * <a href="http://illya-keeplearning.blogspot.com/search/label/suffix%20tree">Java-port</a> of + * <a href="http://marknelson.us/1996/08/01/suffix-trees/">Mark Nelson's C++ implementation of Ukkonen's algorithm</a>. + */ +public final class SuffixTree { + + final Text text; + + private final Node root; + + public static SuffixTree create(Text text) { + SuffixTree tree = new SuffixTree(text); + Suffix active = new Suffix(tree.root, 0, -1); + for (int i = 0; i < text.length(); i++) { + tree.addPrefix(active, i); + } + return tree; + } + + private SuffixTree(Text text) { + this.text = text; + root = new Node(this, null); + } + + private void addPrefix(Suffix active, int endIndex) { + Node lastParentNode = null; + Node parentNode; + + while (true) { + Edge edge; + parentNode = active.getOriginNode(); + + // Step 1 is to try and find a matching edge for the given node. + // If a matching edge exists, we are done adding edges, so we break out of this big loop. + if (active.isExplicit()) { + edge = active.getOriginNode().findEdge(symbolAt(endIndex)); + if (edge != null) { + break; + } + } else { + // implicit node, a little more complicated + edge = active.getOriginNode().findEdge(symbolAt(active.getBeginIndex())); + int span = active.getSpan(); + if (Objects.equal(symbolAt(edge.getBeginIndex() + span + 1), symbolAt(endIndex))) { + break; + } + parentNode = edge.splitEdge(active); + } + + // We didn't find a matching edge, so we create a new one, add it to the tree at the parent node position, + // and insert it into the hash table. When we create a new node, it also means we need to create + // a suffix link to the new node from the last node we visited. + Edge newEdge = new Edge(endIndex, text.length() - 1, parentNode); + newEdge.insert(); + updateSuffixNode(lastParentNode, parentNode); + lastParentNode = parentNode; + + // This final step is where we move to the next smaller suffix + if (active.getOriginNode() == root) { + active.incBeginIndex(); + } else { + active.changeOriginNode(); + } + active.canonize(); + } + updateSuffixNode(lastParentNode, parentNode); + active.incEndIndex(); // Now the endpoint is the next active point + active.canonize(); + } + + private void updateSuffixNode(Node node, Node suffixNode) { + if ((node != null) && (node != root)) { + node.setSuffixNode(suffixNode); + } + } + + public Object symbolAt(int index) { + return text.symbolAt(index); + } + + public Node getRootNode() { + return root; + } + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithm.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithm.java new file mode 100644 index 00000000000..0bd13a88cbf --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithm.java @@ -0,0 +1,111 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +import java.util.*; + +import org.sonar.duplications.block.Block; +import org.sonar.duplications.block.ByteArray; +import org.sonar.duplications.index.CloneGroup; +import org.sonar.duplications.index.CloneIndex; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; + +public final class SuffixTreeCloneDetectionAlgorithm { + + public static List<CloneGroup> detect(CloneIndex cloneIndex, Collection<Block> fileBlocks) { + if (fileBlocks.isEmpty()) { + return Collections.EMPTY_LIST; + } + GeneralisedHashText text = retrieveFromIndex(cloneIndex, fileBlocks); + if (text == null) { + return Collections.EMPTY_LIST; + } + DuplicationsCollector reporter = new DuplicationsCollector(text); + Search.perform(text, reporter); + return reporter.getResult(); + } + + private SuffixTreeCloneDetectionAlgorithm() { + } + + private static GeneralisedHashText retrieveFromIndex(CloneIndex index, Collection<Block> fileBlocks) { + String originResourceId = fileBlocks.iterator().next().getResourceId(); + + Set<ByteArray> hashes = Sets.newHashSet(); + for (Block fileBlock : fileBlocks) { + hashes.add(fileBlock.getBlockHash()); + } + + Map<String, List<Block>> collection = Maps.newHashMap(); + for (ByteArray hash : hashes) { + Collection<Block> blocks = index.getBySequenceHash(hash); + for (Block blockFromIndex : blocks) { + // Godin: skip blocks for this file if they come from index + String resourceId = blockFromIndex.getResourceId(); + if (!originResourceId.equals(resourceId)) { + List<Block> list = collection.get(resourceId); + if (list == null) { + list = Lists.newArrayList(); + collection.put(resourceId, list); + } + list.add(blockFromIndex); + } + } + } + + if (collection.isEmpty() && hashes.size() == fileBlocks.size()) { // optimization for the case when there is no duplications + return null; + } + + GeneralisedHashText text = new GeneralisedHashText(); + List<Block> sortedFileBlocks = Lists.newArrayList(fileBlocks); + Collections.sort(sortedFileBlocks, BLOCK_COMPARATOR); + text.addAll(sortedFileBlocks); + text.addTerminator(); + + for (List<Block> list : collection.values()) { + Collections.sort(list, BLOCK_COMPARATOR); + + int i = 0; + while (i < list.size()) { + int j = i + 1; + while ((j < list.size()) && (list.get(j).getIndexInFile() == list.get(j - 1).getIndexInFile() + 1)) { + j++; + } + text.addAll(list.subList(i, j)); + text.addTerminator(); + i = j; + } + } + + text.finish(); + + return text; + } + + private static final Comparator<Block> BLOCK_COMPARATOR = new Comparator<Block>() { + public int compare(Block o1, Block o2) { + return o1.getIndexInFile() - o2.getIndexInFile(); + } + }; + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Text.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Text.java new file mode 100644 index 00000000000..19b644d494a --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Text.java @@ -0,0 +1,41 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +import java.util.List; + +/** + * Represents text. + */ +public interface Text { + + /** + * @return length of the sequence of symbols represented by this object + */ + int length(); + + /** + * @return symbol at the specified index + */ + Object symbolAt(int index); + + List<Object> sequence(int fromIndex, int toIndex); + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/TextSet.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/TextSet.java new file mode 100644 index 00000000000..aa3628ed048 --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/TextSet.java @@ -0,0 +1,73 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +/** + * Simplifies construction of <a href="http://en.wikipedia.org/wiki/Generalised_suffix_tree">generalised suffix-tree</a>. + */ +public class TextSet extends AbstractText { + + int[] lens; + + public TextSet(int size) { + super(100); // FIXME + + lens = new int[size]; + } + + public TextSet(Text... text) { + this(text.length); + for (int i = 0; i < text.length; i++) { + symbols.addAll(text[i].sequence(0, text[i].length())); + symbols.add(new Terminator(i)); + lens[i] = symbols.size(); + } + } + + public static class Terminator { + + private final int stringNumber; + + public Terminator(int i) { + this.stringNumber = i; + } + + @Override + public boolean equals(Object obj) { + return (obj instanceof Terminator) && (((Terminator) obj).stringNumber == stringNumber); + } + + @Override + public int hashCode() { + return stringNumber; + } + + public int getStringNumber() { + return stringNumber; + } + + @Override + public String toString() { + return "$" + stringNumber; + } + + } + +} |