diff options
author | Evgeny Mandrikov <mandrikov@gmail.com> | 2011-12-08 18:13:54 +0400 |
---|---|---|
committer | Evgeny Mandrikov <mandrikov@gmail.com> | 2011-12-14 15:18:55 +0400 |
commit | b65a688104ad5ba6ef9db5033d173feaa4953f94 (patch) | |
tree | 8e81537127fc8640842d398e3e1521df28e57379 /sonar-duplications | |
parent | bb89464a5f8b68591ada343240a99f9fe7418d87 (diff) | |
download | sonarqube-b65a688104ad5ba6ef9db5033d173feaa4953f94.tar.gz sonarqube-b65a688104ad5ba6ef9db5033d173feaa4953f94.zip |
SONAR-3060 Refactor new CPD algorithm
* Fix violations
* Remove duplications
* Add Javadocs
* Method SortedListsUtils#contains now uses iterators, so doesn't
require RandomAccess list in order to work efficiently in terms of
performance.
Diffstat (limited to 'sonar-duplications')
15 files changed, 463 insertions, 333 deletions
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/ContainsInComparator.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/ContainsInComparator.java new file mode 100644 index 00000000000..aec4e374948 --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/ContainsInComparator.java @@ -0,0 +1,91 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector; + +import java.util.Comparator; + +import org.sonar.duplications.index.ClonePart; +import org.sonar.duplications.utils.FastStringComparator; + +/** + * Allows to determine if ClonePart includes another ClonePart. + * Inclusion is the partial order, so in fact this class violates contracts of {@link Comparator}, + * however it allows to use {@link org.sonar.duplications.utils.SortedListsUtils} for efficient filtering. + */ +public final class ContainsInComparator implements Comparator<ClonePart> { + + /** + * Defines order by resourceId. + */ + public static final Comparator<ClonePart> RESOURCE_ID_COMPARATOR = new Comparator<ClonePart>() { + public int compare(ClonePart o1, ClonePart o2) { + return FastStringComparator.INSTANCE.compare(o1.getResourceId(), o2.getResourceId()); + } + }; + + /** + * Defines order by resourceId and by unitStart. + */ + public static final Comparator<ClonePart> CLONEPART_COMPARATOR = new Comparator<ClonePart>() { + public int compare(ClonePart o1, ClonePart o2) { + int c = RESOURCE_ID_COMPARATOR.compare(o1, o2); + if (c == 0) { + return o1.getUnitStart() - o2.getUnitStart(); + } + return c; + } + }; + + private final int l1, l2; + + /** + * Constructs new comparator for two parts with lengths {@code l1} and {@code l2} respectively. + */ + public ContainsInComparator(int l1, int l2) { + this.l1 = l1; + this.l2 = l2; + } + + /** + * Compares two parts on inclusion. + * part1 includes part2 if {@code (part1.resourceId == part2.resourceId) && (part1.unitStart <= part2.unitStart) && (part2.unitEnd <= part1.unitEnd)}. + * + * @return 0 if part1 includes part2, + * 1 if resourceId of part1 is greater than resourceId of part2 or if unitStart of part1 is greater than unitStart of part2, + * -1 in all other cases + */ + public int compare(ClonePart part1, ClonePart part2) { + int c = RESOURCE_ID_COMPARATOR.compare(part1, part2); + if (c == 0) { + if (part1.getUnitStart() <= part2.getUnitStart()) { + if (part2.getUnitStart() + l2 <= part1.getUnitStart() + l1) { + return 0; // part1 contains part2 + } else { + return -1; // SortedListsUtils#contains should continue search + } + } else { + return 1; // unitStart of part1 is less than unitStart of part2 - SortedListsUtils#contains should stop search + } + } else { + return c; + } + } + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/original/Filter.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/original/Filter.java index 3574b2fed2e..f9e1fecdec9 100644 --- a/sonar-duplications/src/main/java/org/sonar/duplications/detector/original/Filter.java +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/original/Filter.java @@ -19,10 +19,10 @@ */ package org.sonar.duplications.detector.original; -import java.util.Comparator; import java.util.Iterator; import java.util.List; +import org.sonar.duplications.detector.ContainsInComparator; import org.sonar.duplications.index.CloneGroup; import org.sonar.duplications.index.ClonePart; import org.sonar.duplications.utils.FastStringComparator; @@ -115,40 +115,8 @@ final class Filter { } List<ClonePart> firstParts = first.getCloneParts(); List<ClonePart> secondParts = second.getCloneParts(); - return SortedListsUtils.contains(secondParts, firstParts, new ContainsInComparator(first.getCloneUnitLength(), second.getCloneUnitLength())) - && SortedListsUtils.contains(firstParts, secondParts, RESOURCE_ID_COMPARATOR); - } - - private static final Comparator<ClonePart> RESOURCE_ID_COMPARATOR = new Comparator<ClonePart>() { - public int compare(ClonePart o1, ClonePart o2) { - return FastStringComparator.INSTANCE.compare(o1.getResourceId(), o2.getResourceId()); - } - }; - - private static final class ContainsInComparator implements Comparator<ClonePart> { - private final int l1, l2; - - public ContainsInComparator(int l1, int l2) { - this.l1 = l1; - this.l2 = l2; - } - - public int compare(ClonePart o1, ClonePart o2) { - int c = FastStringComparator.INSTANCE.compare(o1.getResourceId(), o2.getResourceId()); - if (c == 0) { - if (o2.getUnitStart() <= o1.getUnitStart()) { - if (o1.getUnitStart() + l1 <= o2.getUnitStart() + l2) { - return 0; // match found - stop search - } else { - return 1; // continue search - } - } else { - return -1; // o1 < o2 by unitStart - stop search - } - } else { - return c; - } - } + return SortedListsUtils.contains(secondParts, firstParts, new ContainsInComparator(second.getCloneUnitLength(), first.getCloneUnitLength())) + && SortedListsUtils.contains(firstParts, secondParts, ContainsInComparator.RESOURCE_ID_COMPARATOR); } } diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/original/OriginalCloneDetectionAlgorithm.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/original/OriginalCloneDetectionAlgorithm.java index c6eb043d2b1..9d451111731 100644 --- a/sonar-duplications/src/main/java/org/sonar/duplications/detector/original/OriginalCloneDetectionAlgorithm.java +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/original/OriginalCloneDetectionAlgorithm.java @@ -206,18 +206,22 @@ public final class OriginalCloneDetectionAlgorithm { for (int i = 0; i < pairs.size(); i++) { Block[] pair = pairs.get(i); - ClonePart part = new ClonePart(pair[0].getResourceId(), pair[0].getIndexInFile(), pair[0].getFirstLineNumber(), pair[1].getLastLineNumber()); - parts.add(part); + Block firstBlock = pair[0]; + Block lastBlock = pair[1]; + ClonePart part = new ClonePart(firstBlock.getResourceId(), + firstBlock.getIndexInFile(), + firstBlock.getFirstLineNumber(), + lastBlock.getLastLineNumber()); if (originResourceId.equals(part.getResourceId())) { if (origin == null) { origin = part; - } else { - if (part.getUnitStart() < origin.getUnitStart()) { - origin = part; - } + } else if (part.getUnitStart() < origin.getUnitStart()) { + origin = part; } } + + parts.add(part); } filter.add(new CloneGroup(cloneLength, origin, parts)); diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java index c6965648bb3..2bdbbb3b98e 100644 --- a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java @@ -20,14 +20,12 @@ package org.sonar.duplications.detector.suffixtree; import java.util.Collections; -import java.util.Comparator; -import java.util.Iterator; import java.util.List; import org.sonar.duplications.block.Block; +import org.sonar.duplications.detector.ContainsInComparator; import org.sonar.duplications.index.CloneGroup; import org.sonar.duplications.index.ClonePart; -import org.sonar.duplications.utils.FastStringComparator; import org.sonar.duplications.utils.SortedListsUtils; import com.google.common.collect.Lists; @@ -35,21 +33,16 @@ import com.google.common.collect.Lists; /** * Implementation of {@link Search.Collector}, which constructs {@link CloneGroup}s. */ -public class DuplicationsCollector implements Search.Collector { +public class DuplicationsCollector extends Search.Collector { private final TextSet text; private final String originResourceId; - /** - * Note that LinkedList should provide better performance here, because of use of operation remove. - * - * @see #filter(CloneGroup) - */ - private final List<CloneGroup> filtered = Lists.newLinkedList(); + private final List<CloneGroup> filtered = Lists.newArrayList(); private int length; private ClonePart origin; - private final List<ClonePart> parts = Lists.newArrayList(); + private List<ClonePart> parts; public DuplicationsCollector(TextSet text) { this.text = text; @@ -63,8 +56,21 @@ public class DuplicationsCollector implements Search.Collector { return filtered; } - public void part(int start, int end, int len) { - length = len; + @Override + public void startOfGroup(int size, int length) { + this.length = length; + this.parts = Lists.newArrayListWithCapacity(size); + } + + /** + * Constructs ClonePart and saves it for future processing in {@link #endOfGroup()}. + * + * @param start number of first block from text for this part + * @param end number of last block from text for this part + * @param len number of blocks in this part + */ + @Override + public void part(int start, int end) { Block firstBlock = text.getBlock(start); Block lastBlock = text.getBlock(end - 1); @@ -86,56 +92,46 @@ public class DuplicationsCollector implements Search.Collector { parts.add(part); } + /** + * Constructs CloneGroup and saves it. + */ + @Override public void endOfGroup() { - Collections.sort(parts, CLONEPART_COMPARATOR); + Collections.sort(parts, ContainsInComparator.CLONEPART_COMPARATOR); CloneGroup group = new CloneGroup(length, origin, parts); filter(group); - parts.clear(); + parts = null; origin = null; } + /** + * Saves CloneGroup, if it is not included into previously saved. + * <p> + * Current CloneGroup can not include none of CloneGroup, which were constructed before. + * Proof: + * According to an order of visiting nodes in suffix tree - length of earlier >= length of current. + * If length of earlier > length of current, then earlier not contained in current. + * If length of earlier = length of current, then earlier can be contained in current only + * when current has exactly the same and maybe some additional CloneParts as earlier, + * what in his turn will mean that two inner-nodes on same depth will satisfy condition + * current.startSize <= earlier.startSize <= earlier.endSize <= current.endSize , which is not possible for different inner-nodes on same depth. + * </p> + * Thus this method checks only that none of CloneGroup, which was constructed before, does not include current CloneGroup. + */ private void filter(CloneGroup current) { - Iterator<CloneGroup> i = filtered.iterator(); - while (i.hasNext()) { - CloneGroup earlier = i.next(); - // Note that following two conditions cannot be true together - proof by contradiction: - // let C be the current clone and A and B were found earlier - // then since relation is transitive - (A in C) and (C in B) => (A in B) - // so A should be filtered earlier + for (CloneGroup earlier : filtered) { if (containsIn(current, earlier)) { - // current clone fully covered by clone, which was found earlier return; } - // TODO Godin: must prove that this is unused - // if (containsIn(earlier, current)) { - // // current clone fully covers clone, which was found earlier - // i.remove(); - // } } filtered.add(current); } - private static final Comparator<ClonePart> CLONEPART_COMPARATOR = new Comparator<ClonePart>() { - public int compare(ClonePart o1, ClonePart o2) { - int c = RESOURCE_ID_COMPARATOR.compare(o1, o2); - if (c == 0) { - return o1.getUnitStart() - o2.getUnitStart(); - } - return c; - } - }; - - private static final Comparator<ClonePart> RESOURCE_ID_COMPARATOR = new Comparator<ClonePart>() { - public int compare(ClonePart o1, ClonePart o2) { - return FastStringComparator.INSTANCE.compare(o1.getResourceId(), o2.getResourceId()); - } - }; - /** - * Checks that second clone contains first one. + * Checks that second CloneGroup includes first one. * <p> - * Clone A is contained in another clone B, if every part pA from A has part pB in B, + * CloneGroup A is included in another CloneGroup B, if every part pA from A has part pB in B, * which satisfy the conditions: * <pre> * (pA.resourceId == pB.resourceId) and (pB.unitStart <= pA.unitStart) and (pA.unitEnd <= pB.unitEnd) @@ -145,7 +141,7 @@ public class DuplicationsCollector implements Search.Collector { * <pre> * pB.resourceId == pA.resourceId * </pre> - * So this relation is: + * Inclusion is the partial order, thus this relation is: * <ul> * <li>reflexive - A in A</li> * <li>transitive - (A in B) and (B in C) => (A in C)</li> @@ -153,48 +149,17 @@ public class DuplicationsCollector implements Search.Collector { * </ul> * </p> * <p> - * This method uses the fact that all parts already sorted by resourceId and unitStart (see {@link #CLONEPART_COMPARATOR}), + * This method uses the fact that all parts already sorted by resourceId and unitStart (see {@link ContainsInComparator#CLONEPART_COMPARATOR}), * so running time - O(|A|+|B|). * </p> */ private static boolean containsIn(CloneGroup first, CloneGroup second) { - // TODO Godin: must prove that this is unused - // if (!first.getOriginPart().getResourceId().equals(second.getOriginPart().getResourceId())) { - // return false; - // } - // if (first.getCloneUnitLength() > second.getCloneUnitLength()) { - // return false; - // } List<ClonePart> firstParts = first.getCloneParts(); List<ClonePart> secondParts = second.getCloneParts(); - return SortedListsUtils.contains(secondParts, firstParts, new ContainsInComparator(first.getCloneUnitLength(), second.getCloneUnitLength())) - && SortedListsUtils.contains(firstParts, secondParts, RESOURCE_ID_COMPARATOR); - } - - private static class ContainsInComparator implements Comparator<ClonePart> { - private final int l1, l2; - - public ContainsInComparator(int l1, int l2) { - this.l1 = l1; - this.l2 = l2; - } - - public int compare(ClonePart o1, ClonePart o2) { - int c = RESOURCE_ID_COMPARATOR.compare(o1, o2); - if (c == 0) { - if (o2.getUnitStart() <= o1.getUnitStart()) { - if (o1.getUnitStart() + l1 <= o2.getUnitStart() + l2) { - return 0; // match found - stop search - } else { - return 1; // continue search - } - } else { - return -1; // o1 < o2 by unitStart - stop search - } - } else { - return c; - } - } + // TODO Godin: according to tests seems that if first part of condition is true, then second part can not be false + // if this can be proved, then second part can be removed + return SortedListsUtils.contains(secondParts, firstParts, new ContainsInComparator(second.getCloneUnitLength(), first.getCloneUnitLength())) + && SortedListsUtils.contains(firstParts, secondParts, ContainsInComparator.RESOURCE_ID_COMPARATOR); } } diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Search.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Search.java index 80d714339d4..d0c32f74765 100644 --- a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Search.java +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/Search.java @@ -24,19 +24,19 @@ import java.util.*; public final class Search { private final SuffixTree tree; - private final int[] lens; + private final TextSet text; private final Collector reporter; private final List<Integer> list = new ArrayList<Integer>(); private final List<Node> innerNodes = new ArrayList<Node>(); public static void perform(TextSet text, Collector reporter) { - new Search(SuffixTree.create(text), text.getLens(), reporter).compute(); + new Search(SuffixTree.create(text), text, reporter).compute(); } - private Search(SuffixTree tree, int[] lens, Collector reporter) { + private Search(SuffixTree tree, TextSet text, Collector reporter) { this.tree = tree; - this.lens = lens; + this.text = text; this.reporter = reporter; } @@ -102,11 +102,17 @@ public final class Search { } } + /** + * TODO Godin: in fact computations here are the same as in {@link #report(Node)}, + * so maybe would be better to remove this duplication, + * however it should be noted that this check can't be done in {@link Collector#endOfGroup()}, + * because it might lead to creation of unnecessary new objects + */ private boolean containsOrigin(Node node) { for (int i = node.startSize; i < node.endSize; i++) { int start = tree.text.length() - list.get(i); int end = start + node.depth; - if (end < lens[0]) { + if (text.isInsideOrigin(end)) { return true; } } @@ -114,23 +120,42 @@ public final class Search { } private void report(Node node) { + reporter.startOfGroup(node.endSize - node.startSize, node.depth); for (int i = node.startSize; i < node.endSize; i++) { int start = tree.text.length() - list.get(i); int end = start + node.depth; - reporter.part(start, end, node.depth); + reporter.part(start, end); } reporter.endOfGroup(); } - public interface Collector { + public static abstract class Collector { /** + * Invoked at the beginning of processing for current node. + * <p> + * Length - is a depth of node. And nodes are visited in descending order of depth, + * thus we guaranty that length will not increase between two sequential calls of this method + * (can be equal or less than previous value). + * </p> + * + * @param size number of parts in group + * @param length length of each part in group + */ + abstract void startOfGroup(int size, int length); + + /** + * Invoked as many times as leafs in the subtree, where current node is root. + * * @param start start position in generalised text * @param end end position in generalised text */ - void part(int start, int end, int len); + abstract void part(int start, int end); - void endOfGroup(); + /** + * Invoked at the end of processing for current node. + */ + abstract void endOfGroup(); } diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTree.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTree.java index b6e21c940c7..e79137dc10a 100644 --- a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTree.java +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/SuffixTree.java @@ -22,9 +22,24 @@ package org.sonar.duplications.detector.suffixtree; import com.google.common.base.Objects; /** - * The implementation of the algorithm for constructing suffix-tree based on - * <a href="http://illya-keeplearning.blogspot.com/search/label/suffix%20tree">Java-port</a> of + * Provides algorithm to construct suffix tree. + * <p> + * Suffix tree for the string S of length n is defined as a tree such that: + * <ul> + * <li>the paths from the root to the leaves have a one-to-one relationship with the suffixes of S,</li> + * <li>edges spell non-empty strings,</li> + * <li>and all internal nodes (except perhaps the root) have at least two children.</li> + * </ul> + * Since such a tree does not exist for all strings, S is padded with a terminal symbol not seen in the string (usually denoted $). + * This ensures that no suffix is a prefix of another, and that there will be n leaf nodes, one for each of the n suffixes of S. + * Since all internal non-root nodes are branching, there can be at most n − 1 such nodes, and n + (n − 1) + 1 = 2n nodes in total. + * All internal nodes and leafs have incoming edge, so number of edges equal to number of leafs plus number of inner nodes, + * thus at most 2n - 1. + * Construction takes O(n) time. + * </p><p> + * This implementation was adapted from <a href="http://illya-keeplearning.blogspot.com/search/label/suffix%20tree">Java-port</a> of * <a href="http://marknelson.us/1996/08/01/suffix-trees/">Mark Nelson's C++ implementation of Ukkonen's algorithm</a>. + * </p> */ public final class SuffixTree { diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/TextSet.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/TextSet.java index 4daccb1776b..033dc879a44 100644 --- a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/TextSet.java +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/TextSet.java @@ -28,28 +28,28 @@ import com.google.common.collect.Lists; /** * Simplifies construction of <a href="http://en.wikipedia.org/wiki/Generalised_suffix_tree">generalised suffix-tree</a>. */ -public class TextSet extends AbstractText { +public final class TextSet extends AbstractText { - public static class Builder { + public static final class Builder { private List<Object> symbols = Lists.newArrayList(); - private List<Integer> sizes = Lists.newArrayList(); + private Integer lengthOfOrigin; + private int count; private Builder() { } public void add(List<Block> list) { symbols.addAll(list); - symbols.add(new Terminator(sizes.size())); - sizes.add(symbols.size()); + symbols.add(new Terminator(count)); + count++; + if (lengthOfOrigin == null) { + lengthOfOrigin = symbols.size(); + } } public TextSet build() { - int[] lens = new int[sizes.size()]; - for (int i = 0; i < sizes.size(); i++) { - lens[i] = sizes.get(i); - } - return new TextSet(symbols, lens); + return new TextSet(symbols, lengthOfOrigin); } } @@ -58,15 +58,15 @@ public class TextSet extends AbstractText { return new Builder(); } - private int[] lens; + private final int lengthOfOrigin; - private TextSet(List<Object> symbols, int[] lens) { + private TextSet(List<Object> symbols, int lengthOfOrigin) { super(symbols); - this.lens = lens; + this.lengthOfOrigin = lengthOfOrigin; } - public int[] getLens() { - return lens; + public boolean isInsideOrigin(int pos) { + return pos < lengthOfOrigin; } @Override diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/utils/SortedListsUtils.java b/sonar-duplications/src/main/java/org/sonar/duplications/utils/SortedListsUtils.java index 39b482e61ae..4efa994c329 100644 --- a/sonar-duplications/src/main/java/org/sonar/duplications/utils/SortedListsUtils.java +++ b/sonar-duplications/src/main/java/org/sonar/duplications/utils/SortedListsUtils.java @@ -20,6 +20,7 @@ package org.sonar.duplications.utils; import java.util.Comparator; +import java.util.Iterator; import java.util.List; /** @@ -28,32 +29,40 @@ import java.util.List; public final class SortedListsUtils { /** - * Returns true if all elements from second list also presented in first list. - * Both lists must be sorted. - * And both must implement {@link java.util.RandomAccess}, otherwise this method is inefficient in terms of performance. + * Returns true if {@code container} contains all elements from {@code list}. + * Both lists must be sorted in consistency with {@code comparator}, + * that is for any two sequential elements x and y: + * {@code (comparator.compare(x, y) <= 0) && (comparator.compare(y, x) >= 0)}. * Running time - O(|container| + |list|). */ public static <T> boolean contains(List<T> container, List<T> list, Comparator<T> comparator) { - int j = 0; - for (int i = 0; i < list.size(); i++) { - T e1 = list.get(i); - boolean found = false; - for (; j < container.size(); j++) { - T e2 = container.get(j); - int c = comparator.compare(e1, e2); - if (c == 0) { - found = true; - break; - } else if (c < 0) { - // e1 is less than e2 - stop search, because all next elements e2 would be greater than e1 + Iterator<T> listIterator = list.iterator(); + Iterator<T> containerIterator = container.iterator(); + T listElement = listIterator.next(); + T containerElement = containerIterator.next(); + while (true) { + int r = comparator.compare(containerElement, listElement); + if (r == 0) { + // current element from list is equal to current element from container + if (!listIterator.hasNext()) { + // no elements remaining in list - all were matched + return true; + } + // next element from list also can be equal to current element from container + listElement = listIterator.next(); + } else if (r < 0) { + // current element from list is greater than current element from container + // need to check next element from container + if (!containerIterator.hasNext()) { return false; } - } - if (!found) { + containerElement = containerIterator.next(); + } else { + // current element from list is less than current element from container + // stop search, because current element from list would be less than any next element from container return false; } } - return true; } private SortedListsUtils() { diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/ContainsInComparatorTest.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/ContainsInComparatorTest.java new file mode 100644 index 00000000000..6933d903545 --- /dev/null +++ b/sonar-duplications/src/test/java/org/sonar/duplications/detector/ContainsInComparatorTest.java @@ -0,0 +1,70 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector; + +import static org.hamcrest.Matchers.is; +import static org.junit.Assert.*; + +import java.util.Comparator; + +import org.junit.Test; +import org.sonar.duplications.index.ClonePart; + +public class ContainsInComparatorTest { + + @Test + public void shouldCompareByResourceId() { + Comparator<ClonePart> comparator = ContainsInComparator.RESOURCE_ID_COMPARATOR; + assertThat(comparator.compare(newClonePart("a", 0), newClonePart("b", 0)), is(-1)); + assertThat(comparator.compare(newClonePart("b", 0), newClonePart("a", 0)), is(1)); + assertThat(comparator.compare(newClonePart("a", 0), newClonePart("a", 0)), is(0)); + } + + @Test + public void shouldCompareByResourceIdAndUnitStart() { + Comparator<ClonePart> comparator = ContainsInComparator.CLONEPART_COMPARATOR; + assertThat(comparator.compare(newClonePart("a", 0), newClonePart("b", 0)), is(-1)); + assertThat(comparator.compare(newClonePart("b", 0), newClonePart("a", 0)), is(1)); + + assertThat(comparator.compare(newClonePart("a", 0), newClonePart("a", 0)), is(0)); + assertThat(comparator.compare(newClonePart("a", 0), newClonePart("a", 1)), is(-1)); + assertThat(comparator.compare(newClonePart("a", 1), newClonePart("a", 0)), is(1)); + } + + @Test + public void shouldCompare() { + assertThat(compare("a", 0, 0, "b", 0, 0), is(-1)); + assertThat(compare("b", 0, 0, "a", 0, 0), is(1)); + + assertThat(compare("a", 0, 0, "a", 0, 0), is(0)); + assertThat(compare("a", 1, 0, "a", 0, 0), is(1)); + assertThat(compare("a", 0, 0, "a", 0, 1), is(-1)); + } + + private static int compare(String resourceId1, int start1, int end1, String resourceId2, int start2, int end2) { + return new ContainsInComparator(end1 - start1, end2 - start2) + .compare(newClonePart(resourceId1, start1), newClonePart(resourceId2, start2)); + } + + private static ClonePart newClonePart(String resourceId, int unitStart) { + return new ClonePart(resourceId, unitStart, 0, 0); + } + +} diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/PrintCollector.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/PrintCollector.java deleted file mode 100644 index ab4b0b9cd8f..00000000000 --- a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/PrintCollector.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Sonar, open source software quality management tool. - * Copyright (C) 2008-2011 SonarSource - * mailto:contact AT sonarsource DOT com - * - * Sonar is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 3 of the License, or (at your option) any later version. - * - * Sonar is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Sonar; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 - */ -package org.sonar.duplications.detector.suffixtree; - -import org.sonar.duplications.detector.suffixtree.Search; -import org.sonar.duplications.detector.suffixtree.TextSet; - -public final class PrintCollector implements Search.Collector { - - private final TextSet text; - private int groups; - - public PrintCollector(TextSet text) { - this.text = text; - } - - public void part(int start, int end, int len) { - System.out.println(start + " " + end + " : " + text.sequence(start, end)); - } - - public void endOfGroup() { - groups++; - System.out.println(); - } - - public int getGroups() { - return groups; - } - -} diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTree.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTree.java index 80daa78225b..78d160c60a3 100644 --- a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTree.java +++ b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTree.java @@ -19,6 +19,9 @@ */ package org.sonar.duplications.detector.suffixtree; +import java.util.LinkedList; +import java.util.Queue; + import org.sonar.duplications.detector.suffixtree.Edge; import org.sonar.duplications.detector.suffixtree.Node; import org.sonar.duplications.detector.suffixtree.SuffixTree; @@ -29,6 +32,9 @@ import com.google.common.base.Objects; public class StringSuffixTree { private final SuffixTree suffixTree; + private int numberOfEdges; + private int numberOfInnerNodes; + private int numberOfLeafs; public static StringSuffixTree create(String text) { return new StringSuffixTree(text); @@ -36,6 +42,34 @@ public class StringSuffixTree { private StringSuffixTree(String text) { suffixTree = SuffixTree.create(new StringText(text)); + + Queue<Node> queue = new LinkedList<Node>(); + queue.add(suffixTree.getRootNode()); + while (!queue.isEmpty()) { + Node node = queue.remove(); + if (node.getEdges().isEmpty()) { + numberOfLeafs++; + } else { + numberOfInnerNodes++; + for (Edge edge : node.getEdges()) { + numberOfEdges++; + queue.add(edge.getEndNode()); + } + } + } + numberOfInnerNodes--; // without root + } + + public int getNumberOfEdges() { + return numberOfEdges; + } + + public int getNumberOfInnerNodes() { + return numberOfInnerNodes; + } + + public int getNumberOfLeafs() { + return numberOfLeafs; } public int indexOf(String str) { diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTreeTest.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTreeTest.java deleted file mode 100644 index 7cd796c755f..00000000000 --- a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/StringSuffixTreeTest.java +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Sonar, open source software quality management tool. - * Copyright (C) 2008-2011 SonarSource - * mailto:contact AT sonarsource DOT com - * - * Sonar is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 3 of the License, or (at your option) any later version. - * - * Sonar is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Sonar; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 - */ -package org.sonar.duplications.detector.suffixtree; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - -import org.junit.Test; - -public class StringSuffixTreeTest { - - @Test - public void testMississippi() { - StringSuffixTree st = StringSuffixTree.create("mississippi"); - - assertTrue(st.contains("miss")); - assertTrue(st.contains("missis")); - assertTrue(st.contains("pi")); - } - - @Test - public void testBanana() { - StringSuffixTree st = StringSuffixTree.create("banana"); - - assertTrue(st.contains("ana")); - assertTrue(st.contains("an")); - assertTrue(st.contains("na")); - } - - @Test - public void testBook() { - StringSuffixTree st = StringSuffixTree.create("book"); - - assertTrue(st.contains("book")); - assertTrue(st.contains("oo")); - assertTrue(st.contains("ok")); - assertFalse(st.contains("okk")); - assertFalse(st.contains("bookk")); - assertFalse(st.contains("bok")); - - assertEquals(0, st.indexOf("book")); - assertEquals(1, st.indexOf("oo")); - assertEquals(2, st.indexOf("ok")); - } - - @Test - public void testBookke() { - StringSuffixTree st = StringSuffixTree.create("bookke"); - - assertTrue(st.contains("bookk")); - - assertEquals(0, st.indexOf("book")); - assertEquals(1, st.indexOf("oo")); - assertEquals(2, st.indexOf("ok")); - } - - @Test - public void testCacao() { - StringSuffixTree st = StringSuffixTree.create("cacao"); - - assertTrue(st.contains("aca")); - - assertEquals(3, st.indexOf("ao")); - assertEquals(0, st.indexOf("ca")); - assertEquals(2, st.indexOf("cao")); - } - - @Test - public void testGoogol() { - StringSuffixTree st = StringSuffixTree.create("googol"); - - assertTrue(st.contains("oo")); - - assertEquals(0, st.indexOf("go")); - assertEquals(3, st.indexOf("gol")); - assertEquals(1, st.indexOf("oo")); - } - - @Test - public void testAbababc() { - StringSuffixTree st = StringSuffixTree.create("abababc"); - - assertTrue(st.contains("aba")); - - assertEquals(0, st.indexOf("aba")); - assertEquals(4, st.indexOf("abc")); - } -} diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithmTest.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithmTest.java index d094196e528..1a3624dadd5 100644 --- a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithmTest.java +++ b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/SuffixTreeCloneDetectionAlgorithmTest.java @@ -29,10 +29,13 @@ import java.util.List; import org.junit.Test; import org.sonar.duplications.block.Block; +import org.sonar.duplications.block.ByteArray; import org.sonar.duplications.detector.DetectorTestCase; import org.sonar.duplications.index.CloneGroup; import org.sonar.duplications.index.CloneIndex; +import com.google.common.collect.Lists; + public class SuffixTreeCloneDetectionAlgorithmTest extends DetectorTestCase { /** @@ -48,6 +51,35 @@ public class SuffixTreeCloneDetectionAlgorithmTest extends DetectorTestCase { } /** + * See SONAR-3060 + * <p> + * In case when file contains a lot of duplicated blocks suffix-tree works better than original algorithm, + * which works more than 5 minutes for this example. + * </p><p> + * However should be noted that current implementation with suffix-tree also is not optimal, + * even if it works for this example couple of seconds, + * because duplications should be filtered in order to remove fully-covered. + * Moreover - height of the tree grows, depending on the number of blocks, which might lead to StackOverflowError, + * because algorithm uses recursion. + * But such cases nearly never appear in real-world, so current implementation is acceptable for the moment. + * </p> + */ + @Test + public void huge() { + CloneIndex index = createIndex(); + List<Block> fileBlocks = Lists.newArrayList(); + int indexInFile = 0; + for (int i = 0; i < 5000; i++) { + Block block = newBlock("x", new ByteArray("01"), indexInFile); + fileBlocks.add(block); + indexInFile++; + } + List<CloneGroup> result = detect(index, fileBlocks); + + assertEquals(1, result.size()); + } + + /** * Given: * <pre> * x: a 2 b 2 c 2 2 2 @@ -57,6 +89,7 @@ public class SuffixTreeCloneDetectionAlgorithmTest extends DetectorTestCase { * x-x (2 2) * x-x-x-x-x (2) * <pre> + * TODO Godin: however would be better to receive only (2) */ @Test public void myTest() { @@ -80,6 +113,9 @@ public class SuffixTreeCloneDetectionAlgorithmTest extends DetectorTestCase { } /** + * This test and associated with it suffix-tree demonstrates that without filtering in {@link DuplicationsCollector#endOfGroup()} + * possible to construct {@link CloneGroup}, which is fully covered by another {@link CloneGroup}. + * * Given: * <pre> * x: a 2 3 b 2 3 c 2 3 d 2 3 2 3 2 3 @@ -89,6 +125,7 @@ public class SuffixTreeCloneDetectionAlgorithmTest extends DetectorTestCase { * x-x (2 3 2 3) * x-x-x-x-x-x (2 3) * <pre> + * TODO Godin: however would be better to receive only (2 3) */ @Test public void myTest2() { @@ -113,6 +150,9 @@ public class SuffixTreeCloneDetectionAlgorithmTest extends DetectorTestCase { } /** + * This test and associated with it suffix-tree demonstrates that without check of origin in {@link Search} + * possible to construct {@link CloneGroup} with a wrong origin. + * * Given: * <pre> * a: 1 2 3 4 diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/SuffixTreeTest.java b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/SuffixTreeTest.java new file mode 100644 index 00000000000..d1f81e3735b --- /dev/null +++ b/sonar-duplications/src/test/java/org/sonar/duplications/detector/suffixtree/SuffixTreeTest.java @@ -0,0 +1,65 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2011 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.detector.suffixtree; + +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.lessThan; +import static org.junit.Assert.assertThat; + +import java.util.Arrays; +import java.util.Collection; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +@RunWith(Parameterized.class) +public class SuffixTreeTest { + + @Parameters + public static Collection<Object[]> generateData() { + return Arrays.asList(new Object[][] { { "banana" }, { "mississippi" }, { "book" }, { "bookke" }, { "cacao" }, { "googol" }, { "abababc" }, { "aaaaa" } }); + } + + private final String data; + + public SuffixTreeTest(String data) { + this.data = data; + } + + @Test + public void test() { + String text = this.data + "$"; + StringSuffixTree tree = StringSuffixTree.create(text); + + assertThat("number of leafs", tree.getNumberOfLeafs(), is(text.length())); + assertThat("number of inner nodes", tree.getNumberOfInnerNodes(), lessThan(text.length() - 1)); + assertThat("number of edges", tree.getNumberOfEdges(), is(tree.getNumberOfInnerNodes() + tree.getNumberOfLeafs())); + + for (int beginIndex = 0; beginIndex < text.length(); beginIndex++) { + for (int endIndex = beginIndex + 1; endIndex < text.length() + 1; endIndex++) { + String substring = text.substring(beginIndex, endIndex); + assertThat("index of " + substring + " in " + text, tree.indexOf(substring), is(text.indexOf(substring))); + } + } + } + +} diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/utils/SortedListsUtilsTest.java b/sonar-duplications/src/test/java/org/sonar/duplications/utils/SortedListsUtilsTest.java index cf099af6eab..c37c6feed4d 100644 --- a/sonar-duplications/src/test/java/org/sonar/duplications/utils/SortedListsUtilsTest.java +++ b/sonar-duplications/src/test/java/org/sonar/duplications/utils/SortedListsUtilsTest.java @@ -32,21 +32,18 @@ public class SortedListsUtilsTest { @Test public void testContains() { - List<Integer> c1 = Arrays.asList(1, 2, 3); - List<Integer> c2 = Arrays.asList(1, 2); - List<Integer> c3 = Arrays.asList(1, 3); + assertThat(contains(Arrays.asList(1, 2, 3), Arrays.asList(1, 2)), is(true)); + assertThat(contains(Arrays.asList(1, 2), Arrays.asList(1, 2, 3)), is(false)); - assertThat(SortedListsUtils.contains(c1, c1, IntegerComparator.INSTANCE), is(true)); - assertThat(SortedListsUtils.contains(c1, c2, IntegerComparator.INSTANCE), is(true)); - assertThat(SortedListsUtils.contains(c1, c3, IntegerComparator.INSTANCE), is(true)); + assertThat(contains(Arrays.asList(1, 2, 3), Arrays.asList(1, 3)), is(true)); + assertThat(contains(Arrays.asList(1, 3), Arrays.asList(1, 2, 3)), is(false)); - assertThat(SortedListsUtils.contains(c2, c1, IntegerComparator.INSTANCE), is(false)); - assertThat(SortedListsUtils.contains(c2, c2, IntegerComparator.INSTANCE), is(true)); - assertThat(SortedListsUtils.contains(c2, c3, IntegerComparator.INSTANCE), is(false)); + assertThat(contains(Arrays.asList(1, 2, 3), Arrays.asList(1, 2, 2, 3)), is(true)); + assertThat(contains(Arrays.asList(1, 2, 2, 3), Arrays.asList(1, 2, 3)), is(true)); + } - assertThat(SortedListsUtils.contains(c3, c1, IntegerComparator.INSTANCE), is(false)); - assertThat(SortedListsUtils.contains(c3, c2, IntegerComparator.INSTANCE), is(false)); - assertThat(SortedListsUtils.contains(c3, c3, IntegerComparator.INSTANCE), is(true)); + private static boolean contains(List<Integer> a, List<Integer> b) { + return SortedListsUtils.contains(a, b, IntegerComparator.INSTANCE); } private static class IntegerComparator implements Comparator<Integer> { |