From b65a688104ad5ba6ef9db5033d173feaa4953f94 Mon Sep 17 00:00:00 2001 From: Evgeny Mandrikov Date: Thu, 8 Dec 2011 18:13:54 +0400 Subject: [PATCH] SONAR-3060 Refactor new CPD algorithm * Fix violations * Remove duplications * Add Javadocs * Method SortedListsUtils#contains now uses iterators, so doesn't require RandomAccess list in order to work efficiently in terms of performance. --- .../detector/ContainsInComparator.java | 91 ++++++++++++ .../detector/original/Filter.java | 38 +---- .../OriginalCloneDetectionAlgorithm.java | 16 ++- .../suffixtree/DuplicationsCollector.java | 131 +++++++----------- .../detector/suffixtree/Search.java | 43 ++++-- .../detector/suffixtree/SuffixTree.java | 19 ++- .../detector/suffixtree/TextSet.java | 30 ++-- .../duplications/utils/SortedListsUtils.java | 45 +++--- .../detector/ContainsInComparatorTest.java | 70 ++++++++++ .../detector/suffixtree/PrintCollector.java | 47 ------- .../detector/suffixtree/StringSuffixTree.java | 34 +++++ .../suffixtree/StringSuffixTreeTest.java | 106 -------------- ...SuffixTreeCloneDetectionAlgorithmTest.java | 40 ++++++ .../detector/suffixtree/SuffixTreeTest.java | 65 +++++++++ .../utils/SortedListsUtilsTest.java | 21 ++- 15 files changed, 463 insertions(+), 333 deletions(-) create mode 100644 sonar-duplications/src/main/java/org/sonar/duplications/detector/ContainsInComparator.java create mode 100644 sonar-duplications/src/test/java/org/sonar/duplications/detector/ContainsInComparatorTest.java delete mode 100644 sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/PrintCollector.java delete mode 100644 sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTreeTest.java create mode 100644 sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/SuffixTreeTest.java diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/ContainsInComparator.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/ContainsInComparator.java new file mode 100644 index 00000000000..aec4e374948 --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/ContainsInComparator.java @@ -0,0 +1,91 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector; + +import java.util.Comparator; + +import org.sonar.duplications.index.ClonePart; +import org.sonar.duplications.utils.FastStringComparator; + +/** + * Allows to determine if ClonePart includes another ClonePart. + * Inclusion is the partial order, so in fact this class violates contracts of {@link Comparator}, + * however it allows to use {@link org.sonar.duplications.utils.SortedListsUtils} for efficient filtering. + */ +public final class ContainsInComparator implements Comparator { + + /** + * Defines order by resourceId. + */ + public static final Comparator RESOURCE_ID_COMPARATOR = new Comparator() { + public int compare(ClonePart o1, ClonePart o2) { + return FastStringComparator.INSTANCE.compare(o1.getResourceId(), o2.getResourceId()); + } + }; + + /** + * Defines order by resourceId and by unitStart. + */ + public static final Comparator CLONEPART_COMPARATOR = new Comparator() { + public int compare(ClonePart o1, ClonePart o2) { + int c = RESOURCE_ID_COMPARATOR.compare(o1, o2); + if (c == 0) { + return o1.getUnitStart() - o2.getUnitStart(); + } + return c; + } + }; + + private final int l1, l2; + + /** + * Constructs new comparator for two parts with lengths {@code l1} and {@code l2} respectively. + */ + public ContainsInComparator(int l1, int l2) { + this.l1 = l1; + this.l2 = l2; + } + + /** + * Compares two parts on inclusion. + * part1 includes part2 if {@code (part1.resourceId == part2.resourceId) && (part1.unitStart <= part2.unitStart) && (part2.unitEnd <= part1.unitEnd)}. + * + * @return 0 if part1 includes part2, + * 1 if resourceId of part1 is greater than resourceId of part2 or if unitStart of part1 is greater than unitStart of part2, + * -1 in all other cases + */ + public int compare(ClonePart part1, ClonePart part2) { + int c = RESOURCE_ID_COMPARATOR.compare(part1, part2); + if (c == 0) { + if (part1.getUnitStart() <= part2.getUnitStart()) { + if (part2.getUnitStart() + l2 <= part1.getUnitStart() + l1) { + return 0; // part1 contains part2 + } else { + return -1; // SortedListsUtils#contains should continue search + } + } else { + return 1; // unitStart of part1 is less than unitStart of part2 - SortedListsUtils#contains should stop search + } + } else { + return c; + } + } + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/original/Filter.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/original/Filter.java index 3574b2fed2e..f9e1fecdec9 100644 --- a/sonar-duplications/src/main/java/org/sonar/duplications/detector/original/Filter.java +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/original/Filter.java @@ -19,10 +19,10 @@ */ package org.sonar.duplications.detector.original; -import java.util.Comparator; import java.util.Iterator; import java.util.List; +import org.sonar.duplications.detector.ContainsInComparator; import org.sonar.duplications.index.CloneGroup; import org.sonar.duplications.index.ClonePart; import org.sonar.duplications.utils.FastStringComparator; @@ -115,40 +115,8 @@ final class Filter { } List firstParts = first.getCloneParts(); List secondParts = second.getCloneParts(); - return SortedListsUtils.contains(secondParts, firstParts, new ContainsInComparator(first.getCloneUnitLength(), second.getCloneUnitLength())) - && SortedListsUtils.contains(firstParts, secondParts, RESOURCE_ID_COMPARATOR); - } - - private static final Comparator RESOURCE_ID_COMPARATOR = new Comparator() { - public int compare(ClonePart o1, ClonePart o2) { - return FastStringComparator.INSTANCE.compare(o1.getResourceId(), o2.getResourceId()); - } - }; - - private static final class ContainsInComparator implements Comparator { - private final int l1, l2; - - public ContainsInComparator(int l1, int l2) { - this.l1 = l1; - this.l2 = l2; - } - - public int compare(ClonePart o1, ClonePart o2) { - int c = FastStringComparator.INSTANCE.compare(o1.getResourceId(), o2.getResourceId()); - if (c == 0) { - if (o2.getUnitStart() <= o1.getUnitStart()) { - if (o1.getUnitStart() + l1 <= o2.getUnitStart() + l2) { - return 0; // match found - stop search - } else { - return 1; // continue search - } - } else { - return -1; // o1 < o2 by unitStart - stop search - } - } else { - return c; - } - } + return SortedListsUtils.contains(secondParts, firstParts, new ContainsInComparator(second.getCloneUnitLength(), first.getCloneUnitLength())) + && SortedListsUtils.contains(firstParts, secondParts, ContainsInComparator.RESOURCE_ID_COMPARATOR); } } diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/original/OriginalCloneDetectionAlgorithm.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/original/OriginalCloneDetectionAlgorithm.java index c6eb043d2b1..9d451111731 100644 --- a/sonar-duplications/src/main/java/org/sonar/duplications/detector/original/OriginalCloneDetectionAlgorithm.java +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/original/OriginalCloneDetectionAlgorithm.java @@ -206,18 +206,22 @@ public final class OriginalCloneDetectionAlgorithm { for (int i = 0; i < pairs.size(); i++) { Block[] pair = pairs.get(i); - ClonePart part = new ClonePart(pair[0].getResourceId(), pair[0].getIndexInFile(), pair[0].getFirstLineNumber(), pair[1].getLastLineNumber()); - parts.add(part); + Block firstBlock = pair[0]; + Block lastBlock = pair[1]; + ClonePart part = new ClonePart(firstBlock.getResourceId(), + firstBlock.getIndexInFile(), + firstBlock.getFirstLineNumber(), + lastBlock.getLastLineNumber()); if (originResourceId.equals(part.getResourceId())) { if (origin == null) { origin = part; - } else { - if (part.getUnitStart() < origin.getUnitStart()) { - origin = part; - } + } else if (part.getUnitStart() < origin.getUnitStart()) { + origin = part; } } + + parts.add(part); } filter.add(new CloneGroup(cloneLength, origin, parts)); diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java index c6965648bb3..2bdbbb3b98e 100644 --- a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java @@ -20,14 +20,12 @@ package org.sonar.duplications.detector.suffixtree; import java.util.Collections; -import java.util.Comparator; -import java.util.Iterator; import java.util.List; import org.sonar.duplications.block.Block; +import org.sonar.duplications.detector.ContainsInComparator; import org.sonar.duplications.index.CloneGroup; import org.sonar.duplications.index.ClonePart; -import org.sonar.duplications.utils.FastStringComparator; import org.sonar.duplications.utils.SortedListsUtils; import com.google.common.collect.Lists; @@ -35,21 +33,16 @@ import com.google.common.collect.Lists; /** * Implementation of {@link Search.Collector}, which constructs {@link CloneGroup}s. */ -public class DuplicationsCollector implements Search.Collector { +public class DuplicationsCollector extends Search.Collector { private final TextSet text; private final String originResourceId; - /** - * Note that LinkedList should provide better performance here, because of use of operation remove. - * - * @see #filter(CloneGroup) - */ - private final List filtered = Lists.newLinkedList(); + private final List filtered = Lists.newArrayList(); private int length; private ClonePart origin; - private final List parts = Lists.newArrayList(); + private List parts; public DuplicationsCollector(TextSet text) { this.text = text; @@ -63,8 +56,21 @@ public class DuplicationsCollector implements Search.Collector { return filtered; } - public void part(int start, int end, int len) { - length = len; + @Override + public void startOfGroup(int size, int length) { + this.length = length; + this.parts = Lists.newArrayListWithCapacity(size); + } + + /** + * Constructs ClonePart and saves it for future processing in {@link #endOfGroup()}. + * + * @param start number of first block from text for this part + * @param end number of last block from text for this part + * @param len number of blocks in this part + */ + @Override + public void part(int start, int end) { Block firstBlock = text.getBlock(start); Block lastBlock = text.getBlock(end - 1); @@ -86,56 +92,46 @@ public class DuplicationsCollector implements Search.Collector { parts.add(part); } + /** + * Constructs CloneGroup and saves it. + */ + @Override public void endOfGroup() { - Collections.sort(parts, CLONEPART_COMPARATOR); + Collections.sort(parts, ContainsInComparator.CLONEPART_COMPARATOR); CloneGroup group = new CloneGroup(length, origin, parts); filter(group); - parts.clear(); + parts = null; origin = null; } + /** + * Saves CloneGroup, if it is not included into previously saved. + *

+ * Current CloneGroup can not include none of CloneGroup, which were constructed before. + * Proof: + * According to an order of visiting nodes in suffix tree - length of earlier >= length of current. + * If length of earlier > length of current, then earlier not contained in current. + * If length of earlier = length of current, then earlier can be contained in current only + * when current has exactly the same and maybe some additional CloneParts as earlier, + * what in his turn will mean that two inner-nodes on same depth will satisfy condition + * current.startSize <= earlier.startSize <= earlier.endSize <= current.endSize , which is not possible for different inner-nodes on same depth. + *

+ * Thus this method checks only that none of CloneGroup, which was constructed before, does not include current CloneGroup. + */ private void filter(CloneGroup current) { - Iterator i = filtered.iterator(); - while (i.hasNext()) { - CloneGroup earlier = i.next(); - // Note that following two conditions cannot be true together - proof by contradiction: - // let C be the current clone and A and B were found earlier - // then since relation is transitive - (A in C) and (C in B) => (A in B) - // so A should be filtered earlier + for (CloneGroup earlier : filtered) { if (containsIn(current, earlier)) { - // current clone fully covered by clone, which was found earlier return; } - // TODO Godin: must prove that this is unused - // if (containsIn(earlier, current)) { - // // current clone fully covers clone, which was found earlier - // i.remove(); - // } } filtered.add(current); } - private static final Comparator CLONEPART_COMPARATOR = new Comparator() { - public int compare(ClonePart o1, ClonePart o2) { - int c = RESOURCE_ID_COMPARATOR.compare(o1, o2); - if (c == 0) { - return o1.getUnitStart() - o2.getUnitStart(); - } - return c; - } - }; - - private static final Comparator RESOURCE_ID_COMPARATOR = new Comparator() { - public int compare(ClonePart o1, ClonePart o2) { - return FastStringComparator.INSTANCE.compare(o1.getResourceId(), o2.getResourceId()); - } - }; - /** - * Checks that second clone contains first one. + * Checks that second CloneGroup includes first one. *

- * Clone A is contained in another clone B, if every part pA from A has part pB in B, + * CloneGroup A is included in another CloneGroup B, if every part pA from A has part pB in B, * which satisfy the conditions: *

    * (pA.resourceId == pB.resourceId) and (pB.unitStart <= pA.unitStart) and (pA.unitEnd <= pB.unitEnd)
@@ -145,7 +141,7 @@ public class DuplicationsCollector implements Search.Collector {
    * 
    * pB.resourceId == pA.resourceId
    * 
- * So this relation is: + * Inclusion is the partial order, thus this relation is: *
    *
  • reflexive - A in A
  • *
  • transitive - (A in B) and (B in C) => (A in C)
  • @@ -153,48 +149,17 @@ public class DuplicationsCollector implements Search.Collector { *
*

*

- * This method uses the fact that all parts already sorted by resourceId and unitStart (see {@link #CLONEPART_COMPARATOR}), + * This method uses the fact that all parts already sorted by resourceId and unitStart (see {@link ContainsInComparator#CLONEPART_COMPARATOR}), * so running time - O(|A|+|B|). *

*/ private static boolean containsIn(CloneGroup first, CloneGroup second) { - // TODO Godin: must prove that this is unused - // if (!first.getOriginPart().getResourceId().equals(second.getOriginPart().getResourceId())) { - // return false; - // } - // if (first.getCloneUnitLength() > second.getCloneUnitLength()) { - // return false; - // } List firstParts = first.getCloneParts(); List secondParts = second.getCloneParts(); - return SortedListsUtils.contains(secondParts, firstParts, new ContainsInComparator(first.getCloneUnitLength(), second.getCloneUnitLength())) - && SortedListsUtils.contains(firstParts, secondParts, RESOURCE_ID_COMPARATOR); - } - - private static class ContainsInComparator implements Comparator { - private final int l1, l2; - - public ContainsInComparator(int l1, int l2) { - this.l1 = l1; - this.l2 = l2; - } - - public int compare(ClonePart o1, ClonePart o2) { - int c = RESOURCE_ID_COMPARATOR.compare(o1, o2); - if (c == 0) { - if (o2.getUnitStart() <= o1.getUnitStart()) { - if (o1.getUnitStart() + l1 <= o2.getUnitStart() + l2) { - return 0; // match found - stop search - } else { - return 1; // continue search - } - } else { - return -1; // o1 < o2 by unitStart - stop search - } - } else { - return c; - } - } + // TODO Godin: according to tests seems that if first part of condition is true, then second part can not be false + // if this can be proved, then second part can be removed + return SortedListsUtils.contains(secondParts, firstParts, new ContainsInComparator(second.getCloneUnitLength(), first.getCloneUnitLength())) + && SortedListsUtils.contains(firstParts, secondParts, ContainsInComparator.RESOURCE_ID_COMPARATOR); } } diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Search.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Search.java index 80d714339d4..d0c32f74765 100644 --- a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Search.java +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Search.java @@ -24,19 +24,19 @@ import java.util.*; public final class Search { private final SuffixTree tree; - private final int[] lens; + private final TextSet text; private final Collector reporter; private final List list = new ArrayList(); private final List innerNodes = new ArrayList(); public static void perform(TextSet text, Collector reporter) { - new Search(SuffixTree.create(text), text.getLens(), reporter).compute(); + new Search(SuffixTree.create(text), text, reporter).compute(); } - private Search(SuffixTree tree, int[] lens, Collector reporter) { + private Search(SuffixTree tree, TextSet text, Collector reporter) { this.tree = tree; - this.lens = lens; + this.text = text; this.reporter = reporter; } @@ -102,11 +102,17 @@ public final class Search { } } + /** + * TODO Godin: in fact computations here are the same as in {@link #report(Node)}, + * so maybe would be better to remove this duplication, + * however it should be noted that this check can't be done in {@link Collector#endOfGroup()}, + * because it might lead to creation of unnecessary new objects + */ private boolean containsOrigin(Node node) { for (int i = node.startSize; i < node.endSize; i++) { int start = tree.text.length() - list.get(i); int end = start + node.depth; - if (end < lens[0]) { + if (text.isInsideOrigin(end)) { return true; } } @@ -114,23 +120,42 @@ public final class Search { } private void report(Node node) { + reporter.startOfGroup(node.endSize - node.startSize, node.depth); for (int i = node.startSize; i < node.endSize; i++) { int start = tree.text.length() - list.get(i); int end = start + node.depth; - reporter.part(start, end, node.depth); + reporter.part(start, end); } reporter.endOfGroup(); } - public interface Collector { + public static abstract class Collector { /** + * Invoked at the beginning of processing for current node. + *

+ * Length - is a depth of node. And nodes are visited in descending order of depth, + * thus we guaranty that length will not increase between two sequential calls of this method + * (can be equal or less than previous value). + *

+ * + * @param size number of parts in group + * @param length length of each part in group + */ + abstract void startOfGroup(int size, int length); + + /** + * Invoked as many times as leafs in the subtree, where current node is root. + * * @param start start position in generalised text * @param end end position in generalised text */ - void part(int start, int end, int len); + abstract void part(int start, int end); - void endOfGroup(); + /** + * Invoked at the end of processing for current node. + */ + abstract void endOfGroup(); } diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTree.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTree.java index b6e21c940c7..e79137dc10a 100644 --- a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTree.java +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTree.java @@ -22,9 +22,24 @@ package org.sonar.duplications.detector.suffixtree; import com.google.common.base.Objects; /** - * The implementation of the algorithm for constructing suffix-tree based on - * Java-port of + * Provides algorithm to construct suffix tree. + *

+ * Suffix tree for the string S of length n is defined as a tree such that: + *

    + *
  • the paths from the root to the leaves have a one-to-one relationship with the suffixes of S,
  • + *
  • edges spell non-empty strings,
  • + *
  • and all internal nodes (except perhaps the root) have at least two children.
  • + *
+ * Since such a tree does not exist for all strings, S is padded with a terminal symbol not seen in the string (usually denoted $). + * This ensures that no suffix is a prefix of another, and that there will be n leaf nodes, one for each of the n suffixes of S. + * Since all internal non-root nodes are branching, there can be at most n − 1 such nodes, and n + (n − 1) + 1 = 2n nodes in total. + * All internal nodes and leafs have incoming edge, so number of edges equal to number of leafs plus number of inner nodes, + * thus at most 2n - 1. + * Construction takes O(n) time. + *

+ * This implementation was adapted from Java-port of * Mark Nelson's C++ implementation of Ukkonen's algorithm. + *

*/ public final class SuffixTree { diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/TextSet.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/TextSet.java index 4daccb1776b..033dc879a44 100644 --- a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/TextSet.java +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/TextSet.java @@ -28,28 +28,28 @@ import com.google.common.collect.Lists; /** * Simplifies construction of generalised suffix-tree. */ -public class TextSet extends AbstractText { +public final class TextSet extends AbstractText { - public static class Builder { + public static final class Builder { private List symbols = Lists.newArrayList(); - private List sizes = Lists.newArrayList(); + private Integer lengthOfOrigin; + private int count; private Builder() { } public void add(List list) { symbols.addAll(list); - symbols.add(new Terminator(sizes.size())); - sizes.add(symbols.size()); + symbols.add(new Terminator(count)); + count++; + if (lengthOfOrigin == null) { + lengthOfOrigin = symbols.size(); + } } public TextSet build() { - int[] lens = new int[sizes.size()]; - for (int i = 0; i < sizes.size(); i++) { - lens[i] = sizes.get(i); - } - return new TextSet(symbols, lens); + return new TextSet(symbols, lengthOfOrigin); } } @@ -58,15 +58,15 @@ public class TextSet extends AbstractText { return new Builder(); } - private int[] lens; + private final int lengthOfOrigin; - private TextSet(List symbols, int[] lens) { + private TextSet(List symbols, int lengthOfOrigin) { super(symbols); - this.lens = lens; + this.lengthOfOrigin = lengthOfOrigin; } - public int[] getLens() { - return lens; + public boolean isInsideOrigin(int pos) { + return pos < lengthOfOrigin; } @Override diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/utils/SortedListsUtils.java b/sonar-duplications/src/main/java/org/sonar/duplications/utils/SortedListsUtils.java index 39b482e61ae..4efa994c329 100644 --- a/sonar-duplications/src/main/java/org/sonar/duplications/utils/SortedListsUtils.java +++ b/sonar-duplications/src/main/java/org/sonar/duplications/utils/SortedListsUtils.java @@ -20,6 +20,7 @@ package org.sonar.duplications.utils; import java.util.Comparator; +import java.util.Iterator; import java.util.List; /** @@ -28,32 +29,40 @@ import java.util.List; public final class SortedListsUtils { /** - * Returns true if all elements from second list also presented in first list. - * Both lists must be sorted. - * And both must implement {@link java.util.RandomAccess}, otherwise this method is inefficient in terms of performance. + * Returns true if {@code container} contains all elements from {@code list}. + * Both lists must be sorted in consistency with {@code comparator}, + * that is for any two sequential elements x and y: + * {@code (comparator.compare(x, y) <= 0) && (comparator.compare(y, x) >= 0)}. * Running time - O(|container| + |list|). */ public static boolean contains(List container, List list, Comparator comparator) { - int j = 0; - for (int i = 0; i < list.size(); i++) { - T e1 = list.get(i); - boolean found = false; - for (; j < container.size(); j++) { - T e2 = container.get(j); - int c = comparator.compare(e1, e2); - if (c == 0) { - found = true; - break; - } else if (c < 0) { - // e1 is less than e2 - stop search, because all next elements e2 would be greater than e1 + Iterator listIterator = list.iterator(); + Iterator containerIterator = container.iterator(); + T listElement = listIterator.next(); + T containerElement = containerIterator.next(); + while (true) { + int r = comparator.compare(containerElement, listElement); + if (r == 0) { + // current element from list is equal to current element from container + if (!listIterator.hasNext()) { + // no elements remaining in list - all were matched + return true; + } + // next element from list also can be equal to current element from container + listElement = listIterator.next(); + } else if (r < 0) { + // current element from list is greater than current element from container + // need to check next element from container + if (!containerIterator.hasNext()) { return false; } - } - if (!found) { + containerElement = containerIterator.next(); + } else { + // current element from list is less than current element from container + // stop search, because current element from list would be less than any next element from container return false; } } - return true; } private SortedListsUtils() { diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/ContainsInComparatorTest.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/ContainsInComparatorTest.java new file mode 100644 index 00000000000..6933d903545 --- /dev/null +++ b/sonar-duplications/src/test/java/org/sonar/duplications/detector/ContainsInComparatorTest.java @@ -0,0 +1,70 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector; + +import static org.hamcrest.Matchers.is; +import static org.junit.Assert.*; + +import java.util.Comparator; + +import org.junit.Test; +import org.sonar.duplications.index.ClonePart; + +public class ContainsInComparatorTest { + + @Test + public void shouldCompareByResourceId() { + Comparator comparator = ContainsInComparator.RESOURCE_ID_COMPARATOR; + assertThat(comparator.compare(newClonePart("a", 0), newClonePart("b", 0)), is(-1)); + assertThat(comparator.compare(newClonePart("b", 0), newClonePart("a", 0)), is(1)); + assertThat(comparator.compare(newClonePart("a", 0), newClonePart("a", 0)), is(0)); + } + + @Test + public void shouldCompareByResourceIdAndUnitStart() { + Comparator comparator = ContainsInComparator.CLONEPART_COMPARATOR; + assertThat(comparator.compare(newClonePart("a", 0), newClonePart("b", 0)), is(-1)); + assertThat(comparator.compare(newClonePart("b", 0), newClonePart("a", 0)), is(1)); + + assertThat(comparator.compare(newClonePart("a", 0), newClonePart("a", 0)), is(0)); + assertThat(comparator.compare(newClonePart("a", 0), newClonePart("a", 1)), is(-1)); + assertThat(comparator.compare(newClonePart("a", 1), newClonePart("a", 0)), is(1)); + } + + @Test + public void shouldCompare() { + assertThat(compare("a", 0, 0, "b", 0, 0), is(-1)); + assertThat(compare("b", 0, 0, "a", 0, 0), is(1)); + + assertThat(compare("a", 0, 0, "a", 0, 0), is(0)); + assertThat(compare("a", 1, 0, "a", 0, 0), is(1)); + assertThat(compare("a", 0, 0, "a", 0, 1), is(-1)); + } + + private static int compare(String resourceId1, int start1, int end1, String resourceId2, int start2, int end2) { + return new ContainsInComparator(end1 - start1, end2 - start2) + .compare(newClonePart(resourceId1, start1), newClonePart(resourceId2, start2)); + } + + private static ClonePart newClonePart(String resourceId, int unitStart) { + return new ClonePart(resourceId, unitStart, 0, 0); + } + +} diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/PrintCollector.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/PrintCollector.java deleted file mode 100644 index ab4b0b9cd8f..00000000000 --- a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/PrintCollector.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Sonar, open source software quality management tool. - * Copyright (C) 2008-2011 SonarSource - * mailto:contact AT sonarsource DOT com - * - * Sonar is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 3 of the License, or (at your option) any later version. - * - * Sonar is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Sonar; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 - */ -package org.sonar.duplications.detector.suffixtree; - -import org.sonar.duplications.detector.suffixtree.Search; -import org.sonar.duplications.detector.suffixtree.TextSet; - -public final class PrintCollector implements Search.Collector { - - private final TextSet text; - private int groups; - - public PrintCollector(TextSet text) { - this.text = text; - } - - public void part(int start, int end, int len) { - System.out.println(start + " " + end + " : " + text.sequence(start, end)); - } - - public void endOfGroup() { - groups++; - System.out.println(); - } - - public int getGroups() { - return groups; - } - -} diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTree.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTree.java index 80daa78225b..78d160c60a3 100644 --- a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTree.java +++ b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTree.java @@ -19,6 +19,9 @@ */ package org.sonar.duplications.detector.suffixtree; +import java.util.LinkedList; +import java.util.Queue; + import org.sonar.duplications.detector.suffixtree.Edge; import org.sonar.duplications.detector.suffixtree.Node; import org.sonar.duplications.detector.suffixtree.SuffixTree; @@ -29,6 +32,9 @@ import com.google.common.base.Objects; public class StringSuffixTree { private final SuffixTree suffixTree; + private int numberOfEdges; + private int numberOfInnerNodes; + private int numberOfLeafs; public static StringSuffixTree create(String text) { return new StringSuffixTree(text); @@ -36,6 +42,34 @@ public class StringSuffixTree { private StringSuffixTree(String text) { suffixTree = SuffixTree.create(new StringText(text)); + + Queue queue = new LinkedList(); + queue.add(suffixTree.getRootNode()); + while (!queue.isEmpty()) { + Node node = queue.remove(); + if (node.getEdges().isEmpty()) { + numberOfLeafs++; + } else { + numberOfInnerNodes++; + for (Edge edge : node.getEdges()) { + numberOfEdges++; + queue.add(edge.getEndNode()); + } + } + } + numberOfInnerNodes--; // without root + } + + public int getNumberOfEdges() { + return numberOfEdges; + } + + public int getNumberOfInnerNodes() { + return numberOfInnerNodes; + } + + public int getNumberOfLeafs() { + return numberOfLeafs; } public int indexOf(String str) { diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTreeTest.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTreeTest.java deleted file mode 100644 index 7cd796c755f..00000000000 --- a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTreeTest.java +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Sonar, open source software quality management tool. - * Copyright (C) 2008-2011 SonarSource - * mailto:contact AT sonarsource DOT com - * - * Sonar is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 3 of the License, or (at your option) any later version. - * - * Sonar is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Sonar; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 - */ -package org.sonar.duplications.detector.suffixtree; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - -import org.junit.Test; - -public class StringSuffixTreeTest { - - @Test - public void testMississippi() { - StringSuffixTree st = StringSuffixTree.create("mississippi"); - - assertTrue(st.contains("miss")); - assertTrue(st.contains("missis")); - assertTrue(st.contains("pi")); - } - - @Test - public void testBanana() { - StringSuffixTree st = StringSuffixTree.create("banana"); - - assertTrue(st.contains("ana")); - assertTrue(st.contains("an")); - assertTrue(st.contains("na")); - } - - @Test - public void testBook() { - StringSuffixTree st = StringSuffixTree.create("book"); - - assertTrue(st.contains("book")); - assertTrue(st.contains("oo")); - assertTrue(st.contains("ok")); - assertFalse(st.contains("okk")); - assertFalse(st.contains("bookk")); - assertFalse(st.contains("bok")); - - assertEquals(0, st.indexOf("book")); - assertEquals(1, st.indexOf("oo")); - assertEquals(2, st.indexOf("ok")); - } - - @Test - public void testBookke() { - StringSuffixTree st = StringSuffixTree.create("bookke"); - - assertTrue(st.contains("bookk")); - - assertEquals(0, st.indexOf("book")); - assertEquals(1, st.indexOf("oo")); - assertEquals(2, st.indexOf("ok")); - } - - @Test - public void testCacao() { - StringSuffixTree st = StringSuffixTree.create("cacao"); - - assertTrue(st.contains("aca")); - - assertEquals(3, st.indexOf("ao")); - assertEquals(0, st.indexOf("ca")); - assertEquals(2, st.indexOf("cao")); - } - - @Test - public void testGoogol() { - StringSuffixTree st = StringSuffixTree.create("googol"); - - assertTrue(st.contains("oo")); - - assertEquals(0, st.indexOf("go")); - assertEquals(3, st.indexOf("gol")); - assertEquals(1, st.indexOf("oo")); - } - - @Test - public void testAbababc() { - StringSuffixTree st = StringSuffixTree.create("abababc"); - - assertTrue(st.contains("aba")); - - assertEquals(0, st.indexOf("aba")); - assertEquals(4, st.indexOf("abc")); - } -} diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithmTest.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithmTest.java index d094196e528..1a3624dadd5 100644 --- a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithmTest.java +++ b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithmTest.java @@ -29,10 +29,13 @@ import java.util.List; import org.junit.Test; import org.sonar.duplications.block.Block; +import org.sonar.duplications.block.ByteArray; import org.sonar.duplications.detector.DetectorTestCase; import org.sonar.duplications.index.CloneGroup; import org.sonar.duplications.index.CloneIndex; +import com.google.common.collect.Lists; + public class SuffixTreeCloneDetectionAlgorithmTest extends DetectorTestCase { /** @@ -47,6 +50,35 @@ public class SuffixTreeCloneDetectionAlgorithmTest extends DetectorTestCase { assertThat(result, sameInstance(Collections.EMPTY_LIST)); } + /** + * See SONAR-3060 + *

+ * In case when file contains a lot of duplicated blocks suffix-tree works better than original algorithm, + * which works more than 5 minutes for this example. + *

+ * However should be noted that current implementation with suffix-tree also is not optimal, + * even if it works for this example couple of seconds, + * because duplications should be filtered in order to remove fully-covered. + * Moreover - height of the tree grows, depending on the number of blocks, which might lead to StackOverflowError, + * because algorithm uses recursion. + * But such cases nearly never appear in real-world, so current implementation is acceptable for the moment. + *

+ */ + @Test + public void huge() { + CloneIndex index = createIndex(); + List fileBlocks = Lists.newArrayList(); + int indexInFile = 0; + for (int i = 0; i < 5000; i++) { + Block block = newBlock("x", new ByteArray("01"), indexInFile); + fileBlocks.add(block); + indexInFile++; + } + List result = detect(index, fileBlocks); + + assertEquals(1, result.size()); + } + /** * Given: *
@@ -57,6 +89,7 @@ public class SuffixTreeCloneDetectionAlgorithmTest extends DetectorTestCase {
    * x-x (2 2)
    * x-x-x-x-x (2)
    * 
+   * TODO Godin: however would be better to receive only (2)
    */
   @Test
   public void myTest() {
@@ -80,6 +113,9 @@ public class SuffixTreeCloneDetectionAlgorithmTest extends DetectorTestCase {
   }
 
   /**
+   * This test and associated with it suffix-tree demonstrates that without filtering in {@link DuplicationsCollector#endOfGroup()}
+   * possible to construct {@link CloneGroup}, which is fully covered by another {@link CloneGroup}.
+   * 
    * Given:
    * 
    * x: a 2 3 b 2 3 c 2 3 d 2 3 2 3 2 3
@@ -89,6 +125,7 @@ public class SuffixTreeCloneDetectionAlgorithmTest extends DetectorTestCase {
    * x-x (2 3 2 3)
    * x-x-x-x-x-x (2 3)
    * 
+   * TODO Godin: however would be better to receive only (2 3)
    */
   @Test
   public void myTest2() {
@@ -113,6 +150,9 @@ public class SuffixTreeCloneDetectionAlgorithmTest extends DetectorTestCase {
   }
 
   /**
+   * This test and associated with it suffix-tree demonstrates that without check of origin in {@link Search}
+   * possible to construct {@link CloneGroup} with a wrong origin.
+   * 
    * Given:
    * 
    * a: 1 2 3 4
diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/SuffixTreeTest.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/SuffixTreeTest.java
new file mode 100644
index 00000000000..d1f81e3735b
--- /dev/null
+++ b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/SuffixTreeTest.java
@@ -0,0 +1,65 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2011 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02
+ */
+package org.sonar.duplications.detector.suffixtree;
+
+import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.lessThan;
+import static org.junit.Assert.assertThat;
+
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+@RunWith(Parameterized.class)
+public class SuffixTreeTest {
+
+  @Parameters
+  public static Collection generateData() {
+    return Arrays.asList(new Object[][] { { "banana" }, { "mississippi" }, { "book" }, { "bookke" }, { "cacao" }, { "googol" }, { "abababc" }, { "aaaaa" } });
+  }
+
+  private final String data;
+
+  public SuffixTreeTest(String data) {
+    this.data = data;
+  }
+
+  @Test
+  public void test() {
+    String text = this.data + "$";
+    StringSuffixTree tree = StringSuffixTree.create(text);
+
+    assertThat("number of leafs", tree.getNumberOfLeafs(), is(text.length()));
+    assertThat("number of inner nodes", tree.getNumberOfInnerNodes(), lessThan(text.length() - 1));
+    assertThat("number of edges", tree.getNumberOfEdges(), is(tree.getNumberOfInnerNodes() + tree.getNumberOfLeafs()));
+
+    for (int beginIndex = 0; beginIndex < text.length(); beginIndex++) {
+      for (int endIndex = beginIndex + 1; endIndex < text.length() + 1; endIndex++) {
+        String substring = text.substring(beginIndex, endIndex);
+        assertThat("index of " + substring + " in " + text, tree.indexOf(substring), is(text.indexOf(substring)));
+      }
+    }
+  }
+
+}
diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/utils/SortedListsUtilsTest.java b/sonar-duplications/src/test/java/org/sonar/duplications/utils/SortedListsUtilsTest.java
index cf099af6eab..c37c6feed4d 100644
--- a/sonar-duplications/src/test/java/org/sonar/duplications/utils/SortedListsUtilsTest.java
+++ b/sonar-duplications/src/test/java/org/sonar/duplications/utils/SortedListsUtilsTest.java
@@ -32,21 +32,18 @@ public class SortedListsUtilsTest {
 
   @Test
   public void testContains() {
-    List c1 = Arrays.asList(1, 2, 3);
-    List c2 = Arrays.asList(1, 2);
-    List c3 = Arrays.asList(1, 3);
+    assertThat(contains(Arrays.asList(1, 2, 3), Arrays.asList(1, 2)), is(true));
+    assertThat(contains(Arrays.asList(1, 2), Arrays.asList(1, 2, 3)), is(false));
 
-    assertThat(SortedListsUtils.contains(c1, c1, IntegerComparator.INSTANCE), is(true));
-    assertThat(SortedListsUtils.contains(c1, c2, IntegerComparator.INSTANCE), is(true));
-    assertThat(SortedListsUtils.contains(c1, c3, IntegerComparator.INSTANCE), is(true));
+    assertThat(contains(Arrays.asList(1, 2, 3), Arrays.asList(1, 3)), is(true));
+    assertThat(contains(Arrays.asList(1, 3), Arrays.asList(1, 2, 3)), is(false));
 
-    assertThat(SortedListsUtils.contains(c2, c1, IntegerComparator.INSTANCE), is(false));
-    assertThat(SortedListsUtils.contains(c2, c2, IntegerComparator.INSTANCE), is(true));
-    assertThat(SortedListsUtils.contains(c2, c3, IntegerComparator.INSTANCE), is(false));
+    assertThat(contains(Arrays.asList(1, 2, 3), Arrays.asList(1, 2, 2, 3)), is(true));
+    assertThat(contains(Arrays.asList(1, 2, 2, 3), Arrays.asList(1, 2, 3)), is(true));
+  }
 
-    assertThat(SortedListsUtils.contains(c3, c1, IntegerComparator.INSTANCE), is(false));
-    assertThat(SortedListsUtils.contains(c3, c2, IntegerComparator.INSTANCE), is(false));
-    assertThat(SortedListsUtils.contains(c3, c3, IntegerComparator.INSTANCE), is(true));
+  private static boolean contains(List a, List b) {
+    return SortedListsUtils.contains(a, b, IntegerComparator.INSTANCE);
   }
 
   private static class IntegerComparator implements Comparator {
-- 
2.39.5