From 90a6c00024ca6fc4476c6ecf7ab30e44e397240f Mon Sep 17 00:00:00 2001 From: Evgeny Mandrikov Date: Wed, 25 Jan 2012 19:07:33 +0400 Subject: [PATCH] SONAR-3181,SONAR-3139 Allow filtering by number of tokens To do this each block stores two additional integers - index for first and last token. So total size of in-memory storage has been increased. --- .../java/org/sonar/plugins/cpd/PmdEngine.java | 10 +- .../sonar/plugins/cpd/SonarBridgeEngine.java | 21 ++-- .../org/sonar/plugins/cpd/SonarEngine.java | 4 +- .../org/sonar/duplications/CodeFragment.java | 46 ++++++++ .../org/sonar/duplications/block/Block.java | 68 +++++++++++- .../duplications/block/BlockChunker.java | 7 +- .../suffixtree/DuplicationsCollector.java | 79 ++++++++------ .../sonar/duplications/index/CloneGroup.java | 30 +++++- .../sonar/duplications/index/ClonePart.java | 56 ++++++---- .../index/PackedMemoryCloneIndex.java | 33 ++++-- .../internal/pmd/PmdBlockChunker.java | 80 ++++++++++++++ .../internal/pmd/TokenizerBridge.java | 45 ++++---- .../duplications/internal/pmd/TokensLine.java | 69 ++++++++++++ .../duplications/statement/Statement.java | 13 +-- .../org/sonar/duplications/cpd/CPDTest.java | 4 +- .../internal/pmd/PmdBlockChunkerTest.java | 56 ++++++++++ .../internal/pmd/PmdBridgeTest.java | 102 ++++++++++++++++++ .../internal/pmd/TokenizerBridgeTest.java | 65 +++++++---- 18 files changed, 657 insertions(+), 131 deletions(-) create mode 100644 sonar-duplications/src/main/java/org/sonar/duplications/CodeFragment.java create mode 100644 sonar-duplications/src/main/java/org/sonar/duplications/internal/pmd/PmdBlockChunker.java create mode 100644 sonar-duplications/src/main/java/org/sonar/duplications/internal/pmd/TokensLine.java create mode 100644 sonar-duplications/src/test/java/org/sonar/duplications/internal/pmd/PmdBlockChunkerTest.java create mode 100644 sonar-duplications/src/test/java/org/sonar/duplications/internal/pmd/PmdBridgeTest.java diff --git a/plugins/sonar-cpd-plugin/src/main/java/org/sonar/plugins/cpd/PmdEngine.java b/plugins/sonar-cpd-plugin/src/main/java/org/sonar/plugins/cpd/PmdEngine.java index 08b590c0917..e84f22e36de 100644 --- a/plugins/sonar-cpd-plugin/src/main/java/org/sonar/plugins/cpd/PmdEngine.java +++ b/plugins/sonar-cpd-plugin/src/main/java/org/sonar/plugins/cpd/PmdEngine.java @@ -19,12 +19,9 @@ */ package org.sonar.plugins.cpd; -import java.io.IOException; -import java.nio.charset.Charset; - +import com.google.common.annotations.VisibleForTesting; import net.sourceforge.pmd.cpd.AbstractLanguage; import net.sourceforge.pmd.cpd.TokenEntry; - import org.apache.commons.configuration.Configuration; import org.sonar.api.CoreProperties; import org.sonar.api.batch.CpdMapping; @@ -33,7 +30,8 @@ import org.sonar.api.resources.Language; import org.sonar.api.resources.Project; import org.sonar.duplications.cpd.CPD; -import com.google.common.annotations.VisibleForTesting; +import java.io.IOException; +import java.nio.charset.Charset; public class PmdEngine extends CpdEngine { @@ -103,7 +101,7 @@ public class PmdEngine extends CpdEngine { } @VisibleForTesting - int getMinimumTokens(Project project) { + static int getMinimumTokens(Project project) { Configuration conf = project.getConfiguration(); return conf.getInt("sonar.cpd." + project.getLanguageKey() + ".minimumTokens", conf.getInt("sonar.cpd.minimumTokens", CoreProperties.CPD_MINIMUM_TOKENS_DEFAULT_VALUE)); diff --git a/plugins/sonar-cpd-plugin/src/main/java/org/sonar/plugins/cpd/SonarBridgeEngine.java b/plugins/sonar-cpd-plugin/src/main/java/org/sonar/plugins/cpd/SonarBridgeEngine.java index a1cab7980d3..2e736acde3f 100644 --- a/plugins/sonar-cpd-plugin/src/main/java/org/sonar/plugins/cpd/SonarBridgeEngine.java +++ b/plugins/sonar-cpd-plugin/src/main/java/org/sonar/plugins/cpd/SonarBridgeEngine.java @@ -19,11 +19,12 @@ */ package org.sonar.plugins.cpd; +import com.google.common.base.Predicate; +import com.google.common.collect.Iterables; import org.sonar.api.batch.CpdMapping; import org.sonar.api.batch.SensorContext; import org.sonar.api.resources.*; import org.sonar.duplications.block.Block; -import org.sonar.duplications.block.BlockChunker; import org.sonar.duplications.detector.suffixtree.SuffixTreeCloneDetectionAlgorithm; import org.sonar.duplications.index.CloneGroup; import org.sonar.duplications.internal.pmd.TokenizerBridge; @@ -35,8 +36,6 @@ import java.util.List; public class SonarBridgeEngine extends CpdEngine { - private static final int BLOCK_SIZE = 10; - private final IndexFactory indexFactory; private final CpdMapping[] mappings; @@ -67,24 +66,32 @@ public class SonarBridgeEngine extends CpdEngine { // Create index SonarDuplicationsIndex index = indexFactory.create(project); - BlockChunker blockChunker = new BlockChunker(BLOCK_SIZE); TokenizerBridge bridge = new TokenizerBridge(mapping.getTokenizer(), fileSystem.getSourceCharset().name()); for (InputFile inputFile : inputFiles) { Resource resource = mapping.createResource(inputFile.getFile(), fileSystem.getSourceDirs()); String resourceId = SonarEngine.getFullKey(project, resource); - List blocks = blockChunker.chunk(resourceId, bridge.tokenize(inputFile.getFile())); + List blocks = bridge.chunk(resourceId, inputFile.getFile()); index.insert(resource, blocks); } - bridge.clearCache(); // Detect + final int minimumTokens = PmdEngine.getMinimumTokens(project); + Predicate minimumTokensPredicate = new Predicate() { + public boolean apply(CloneGroup input) { + return input.getLengthInUnits() >= minimumTokens; + } + }; + for (InputFile inputFile : inputFiles) { Resource resource = mapping.createResource(inputFile.getFile(), fileSystem.getSourceDirs()); String resourceKey = SonarEngine.getFullKey(project, resource); Collection fileBlocks = index.getByResource(resource, resourceKey); List duplications = SuffixTreeCloneDetectionAlgorithm.detect(index, fileBlocks); - SonarEngine.save(context, resource, duplications); + + Iterable filtered = Iterables.filter(duplications, minimumTokensPredicate); + + SonarEngine.save(context, resource, filtered); } } diff --git a/plugins/sonar-cpd-plugin/src/main/java/org/sonar/plugins/cpd/SonarEngine.java b/plugins/sonar-cpd-plugin/src/main/java/org/sonar/plugins/cpd/SonarEngine.java index 0c1a4345b61..9ae22f814c3 100644 --- a/plugins/sonar-cpd-plugin/src/main/java/org/sonar/plugins/cpd/SonarEngine.java +++ b/plugins/sonar-cpd-plugin/src/main/java/org/sonar/plugins/cpd/SonarEngine.java @@ -164,8 +164,8 @@ public class SonarEngine extends CpdEngine { return JavaFile.fromRelativePath(inputFile.getRelativePath(), false); } - static void save(SensorContext context, Resource resource, List clones) { - if (clones == null || clones.isEmpty()) { + static void save(SensorContext context, Resource resource, Iterable clones) { + if (clones == null || !clones.iterator().hasNext()) { return; } // Calculate number of lines and blocks diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/CodeFragment.java b/sonar-duplications/src/main/java/org/sonar/duplications/CodeFragment.java new file mode 100644 index 00000000000..e908f3a5060 --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/CodeFragment.java @@ -0,0 +1,46 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2012 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications; + +import com.google.common.annotations.Beta; + +/** + * TODO Enforce contracts of this interface in concrete classes by using preconditions, currently this leads to failures of tests. + * + *

This interface is not intended to be implemented by clients.

+ * + * @since 2.14 + */ +@Beta +public interface CodeFragment { + + /** + * Number of line where fragment starts. + * Numbering starts from 1. + */ + int getStartLine(); + + /** + * Number of line where fragment ends. + * Numbering starts from 1. + */ + int getEndLine(); + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/block/Block.java b/sonar-duplications/src/main/java/org/sonar/duplications/block/Block.java index 7a1f659b55d..fb98dd10732 100644 --- a/sonar-duplications/src/main/java/org/sonar/duplications/block/Block.java +++ b/sonar-duplications/src/main/java/org/sonar/duplications/block/Block.java @@ -19,11 +19,14 @@ */ package org.sonar.duplications.block; +import com.google.common.annotations.Beta; +import org.sonar.duplications.CodeFragment; + /** * Represents part of source code between two lines. * If two blocks have the same {@link #getBlockHash() hash}, then we assume that there is a duplication in a code, which they represent. */ -public final class Block { +public final class Block implements CodeFragment { private final String resourceId; private final ByteArray blockHash; @@ -64,14 +67,69 @@ public final class Block { return indexInFile; } + private int startUnit; + private int endUnit; + + /** + * @since 2.14 + */ + @Beta + public int getStartUnit() { + return startUnit; + } + + /** + * @since 2.14 + */ + @Beta + public int getEndUnit() { + return endUnit; + } + + /** + * TODO get rid of this method, otherwise class is not immutable + * + * @see #getStartUnit() + * @since 2.14 + */ + @Beta + public void setStartUnit(int startUnit) { + this.startUnit = startUnit; + } + + /** + * TODO get rid of this method, otherwise class is not immutable + * + * @see #getEndUnit() + * @since 2.14 + */ + @Beta + public void setEndUnit(int endUnit) { + this.endUnit = endUnit; + } + + /** + * @deprecated in 2.14, use {@link #getStartLine()} instead + */ public int getFirstLineNumber() { return firstLineNumber; } + /** + * @deprecated in 2.14, use {@link #getEndLine()} instead + */ public int getLastLineNumber() { return lastLineNumber; } + public int getStartLine() { + return firstLineNumber; + } + + public int getEndLine() { + return lastLineNumber; + } + @Override public boolean equals(Object obj) { if (!(obj instanceof Block)) { @@ -79,10 +137,10 @@ public final class Block { } Block other = (Block) obj; return resourceId.equals(other.resourceId) - && blockHash.equals(other.blockHash) - && indexInFile == other.indexInFile - && firstLineNumber == other.firstLineNumber - && lastLineNumber == other.lastLineNumber; + && blockHash.equals(other.blockHash) + && indexInFile == other.indexInFile + && firstLineNumber == other.firstLineNumber + && lastLineNumber == other.lastLineNumber; } @Override diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/block/BlockChunker.java b/sonar-duplications/src/main/java/org/sonar/duplications/block/BlockChunker.java index cbb8db112f9..3abc0742493 100644 --- a/sonar-duplications/src/main/java/org/sonar/duplications/block/BlockChunker.java +++ b/sonar-duplications/src/main/java/org/sonar/duplications/block/BlockChunker.java @@ -19,12 +19,11 @@ */ package org.sonar.duplications.block; -import java.util.Collections; -import java.util.List; - +import com.google.common.collect.Lists; import org.sonar.duplications.statement.Statement; -import com.google.common.collect.Lists; +import java.util.Collections; +import java.util.List; /** * Creates blocks from statements, each block will contain specified number of statements (blockSize) and 64-bits (8-bytes) hash value. diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java index c72d6063ab2..1b9f5f8b164 100644 --- a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java +++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java @@ -19,16 +19,15 @@ */ package org.sonar.duplications.detector.suffixtree; -import java.util.Collections; -import java.util.List; - +import com.google.common.collect.Lists; import org.sonar.duplications.block.Block; import org.sonar.duplications.detector.ContainsInComparator; import org.sonar.duplications.index.CloneGroup; import org.sonar.duplications.index.ClonePart; import org.sonar.duplications.utils.SortedListsUtils; -import com.google.common.collect.Lists; +import java.util.Collections; +import java.util.List; /** * Implementation of {@link Search.Collector}, which constructs {@link CloneGroup}s. @@ -41,8 +40,8 @@ public class DuplicationsCollector extends Search.Collector { private final List filtered = Lists.newArrayList(); private int length; - private ClonePart origin; - private List parts; + private int count; + private int[][] blockNumbers; public DuplicationsCollector(TextSet text) { this.text = text; @@ -58,38 +57,22 @@ public class DuplicationsCollector extends Search.Collector { @Override public void startOfGroup(int size, int length) { + this.blockNumbers = new int[size][2]; this.length = length; - this.parts = Lists.newArrayListWithCapacity(size); } /** * Constructs ClonePart and saves it for future processing in {@link #endOfGroup()}. - * + * * @param start number of first block from text for this part * @param end number of last block from text for this part * @param len number of blocks in this part */ @Override public void part(int start, int end) { - Block firstBlock = text.getBlock(start); - Block lastBlock = text.getBlock(end - 1); - - ClonePart part = new ClonePart( - firstBlock.getResourceId(), - firstBlock.getIndexInFile(), - firstBlock.getFirstLineNumber(), - lastBlock.getLastLineNumber()); - - // TODO Godin: maybe use FastStringComparator here ? - if (originResourceId.equals(part.getResourceId())) { // part from origin - if (origin == null) { - origin = part; - } else if (part.getUnitStart() < origin.getUnitStart()) { - origin = part; - } - } - - parts.add(part); + blockNumbers[count][0] = start; + blockNumbers[count][1] = end - 1; + count++; } /** @@ -97,12 +80,48 @@ public class DuplicationsCollector extends Search.Collector { */ @Override public void endOfGroup() { + int lengthInUnits = 0; + ClonePart origin = null; + List parts = Lists.newArrayListWithCapacity(count); + for (int[] b : blockNumbers) { + Block firstBlock = text.getBlock(b[0]); + Block lastBlock = text.getBlock(b[1]); + ClonePart part = new ClonePart( + firstBlock.getResourceId(), + firstBlock.getIndexInFile(), + firstBlock.getFirstLineNumber(), + lastBlock.getLastLineNumber()); + + // TODO Godin: maybe use FastStringComparator here ? + if (originResourceId.equals(part.getResourceId())) { // part from origin + if (origin == null) { + origin = part; + // To calculate length important to use the origin, because otherwise block may come from DB without required data + lengthInUnits = lastBlock.getEndUnit() - firstBlock.getStartUnit() + 1; + } else if (part.getUnitStart() < origin.getUnitStart()) { + origin = part; + } + } + + parts.add(part); + } + Collections.sort(parts, ContainsInComparator.CLONEPART_COMPARATOR); + CloneGroup group = new CloneGroup(length, origin, parts); + group.setLengthInUnits(lengthInUnits); + filter(group); - parts = null; - origin = null; + reset(); + } + + /** + * Prepare for processing of next duplication. + */ + private void reset() { + blockNumbers = null; + count = 0; } /** @@ -159,7 +178,7 @@ public class DuplicationsCollector extends Search.Collector { // TODO Godin: according to tests seems that if first part of condition is true, then second part can not be false // if this can be proved, then second part can be removed return SortedListsUtils.contains(secondParts, firstParts, new ContainsInComparator(second.getCloneUnitLength(), first.getCloneUnitLength())) - && SortedListsUtils.contains(firstParts, secondParts, ContainsInComparator.RESOURCE_ID_COMPARATOR); + && SortedListsUtils.contains(firstParts, secondParts, ContainsInComparator.RESOURCE_ID_COMPARATOR); } } diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/index/CloneGroup.java b/sonar-duplications/src/main/java/org/sonar/duplications/index/CloneGroup.java index 8c662be34aa..dfb86bb93d6 100644 --- a/sonar-duplications/src/main/java/org/sonar/duplications/index/CloneGroup.java +++ b/sonar-duplications/src/main/java/org/sonar/duplications/index/CloneGroup.java @@ -19,10 +19,11 @@ */ package org.sonar.duplications.index; -import java.util.List; - +import com.google.common.annotations.Beta; import com.google.common.collect.ImmutableList; +import java.util.List; + /** * Groups a set of related {@link ClonePart}s. */ @@ -47,8 +48,31 @@ public class CloneGroup { return originPart; } + private int length; + + /** + * Length of duplication measured in original units, e.g. for token-based detection - in tokens. + * + * @since 2.14 + */ + @Beta + public int getLengthInUnits() { + return length; + } + + /** + * TODO get rid of this method, otherwise class is not immutable + * + * @since 2.14 + * @see #getLengthInUnits() + */ + @Beta + public void setLengthInUnits(int length) { + this.length = length; + } + /** - * @return clone length in units (not in lines) + * @return clone length in {@link org.sonar.duplications.block.Block}s */ public int getCloneUnitLength() { return cloneLength; diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/index/ClonePart.java b/sonar-duplications/src/main/java/org/sonar/duplications/index/ClonePart.java index 274b3708ef1..36648b19d25 100644 --- a/sonar-duplications/src/main/java/org/sonar/duplications/index/ClonePart.java +++ b/sonar-duplications/src/main/java/org/sonar/duplications/index/ClonePart.java @@ -19,23 +19,25 @@ */ package org.sonar.duplications.index; -public class ClonePart { +import org.sonar.duplications.CodeFragment; + +public class ClonePart implements CodeFragment { private final String resourceId; - private final int unitStart; - private final int lineStart; - private final int lineEnd; + private final int startUnit; + private final int startLine; + private final int endLine; /** * Cache for hash code. */ private int hash; - public ClonePart(String resourceId, int unitStart, int lineStart, int lineEnd) { + public ClonePart(String resourceId, int startUnit, int startLine, int endLine) { this.resourceId = resourceId; - this.unitStart = unitStart; - this.lineStart = lineStart; - this.lineEnd = lineEnd; + this.startUnit = startUnit; + this.startLine = startLine; + this.endLine = endLine; } public String getResourceId() { @@ -43,19 +45,35 @@ public class ClonePart { } public int getUnitStart() { - return unitStart; + return startUnit; } + /** + * @deprecated in 2.14, use {@link #getStartLine()} instead + */ + @Deprecated public int getLineStart() { - return lineStart; + return startLine; } + /** + * @deprecated in 2.14, use {@link #getEndLine()} instead + */ + @Deprecated public int getLineEnd() { - return lineEnd; + return endLine; + } + + public int getStartLine() { + return startLine; + } + + public int getEndLine() { + return endLine; } public int getLines() { - return lineEnd - lineStart + 1; + return endLine - startLine + 1; } @Override @@ -63,9 +81,9 @@ public class ClonePart { if (obj instanceof ClonePart) { ClonePart another = (ClonePart) obj; return another.resourceId.equals(resourceId) - && another.lineStart == lineStart - && another.lineEnd == lineEnd - && another.unitStart == unitStart; + && another.startLine == startLine + && another.endLine == endLine + && another.startUnit == startUnit; } return false; } @@ -75,9 +93,9 @@ public class ClonePart { int h = hash; if (h == 0) { h = resourceId.hashCode(); - h = 31 * h + lineStart; - h = 31 * h + lineEnd; - h = 31 * h + unitStart; + h = 31 * h + startLine; + h = 31 * h + endLine; + h = 31 * h + startUnit; hash = h; } return h; @@ -85,7 +103,7 @@ public class ClonePart { @Override public String toString() { - return "'" + resourceId + "':[" + unitStart + "|" + lineStart + "-" + lineEnd + "]"; + return "'" + resourceId + "':[" + startUnit + "|" + startLine + "-" + endLine + "]"; } } diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/index/PackedMemoryCloneIndex.java b/sonar-duplications/src/main/java/org/sonar/duplications/index/PackedMemoryCloneIndex.java index 4223c8e95a7..cb721a0cc78 100644 --- a/sonar-duplications/src/main/java/org/sonar/duplications/index/PackedMemoryCloneIndex.java +++ b/sonar-duplications/src/main/java/org/sonar/duplications/index/PackedMemoryCloneIndex.java @@ -19,14 +19,13 @@ */ package org.sonar.duplications.index; -import java.util.Collection; -import java.util.List; - +import com.google.common.collect.Lists; import org.sonar.duplications.block.Block; import org.sonar.duplications.block.ByteArray; import org.sonar.duplications.utils.FastStringComparator; -import com.google.common.collect.Lists; +import java.util.Collection; +import java.util.List; /** * Provides an index optimized by memory. @@ -46,7 +45,7 @@ public class PackedMemoryCloneIndex extends AbstractCloneIndex { private static final int DEFAULT_INITIAL_CAPACITY = 1024; - private static final int BLOCK_INTS = 3; + private static final int BLOCK_INTS = 5; private final int hashInts; @@ -111,9 +110,14 @@ public class PackedMemoryCloneIndex extends AbstractCloneIndex { } int indexInFile = blockData[offset++]; int firstLineNumber = blockData[offset++]; - int lastLineNumber = blockData[offset]; + int lastLineNumber = blockData[offset++]; + int startUnit = blockData[offset++]; + int endUnit = blockData[offset]; - result.add(new Block(resourceId, new ByteArray(hash), indexInFile, firstLineNumber, lastLineNumber)); + Block block = new Block(resourceId, new ByteArray(hash), indexInFile, firstLineNumber, lastLineNumber); + block.setStartUnit(startUnit); + block.setEndUnit(endUnit); + result.add(block); index++; realIndex = resourceIdsIndex[index]; @@ -146,9 +150,14 @@ public class PackedMemoryCloneIndex extends AbstractCloneIndex { offset = index * blockInts + hashInts; int indexInFile = blockData[offset++]; int firstLineNumber = blockData[offset++]; - int lastLineNumber = blockData[offset]; - - result.add(new Block(resourceId, sequenceHash, indexInFile, firstLineNumber, lastLineNumber)); + int lastLineNumber = blockData[offset++]; + int startUnit = blockData[offset++]; + int endUnit = blockData[offset]; + + Block block = new Block(resourceId, sequenceHash, indexInFile, firstLineNumber, lastLineNumber); + block.setStartUnit(startUnit); + block.setEndUnit(endUnit); + result.add(block); index++; } return result; @@ -176,7 +185,9 @@ public class PackedMemoryCloneIndex extends AbstractCloneIndex { } blockData[offset++] = block.getIndexInFile(); blockData[offset++] = block.getFirstLineNumber(); - blockData[offset] = block.getLastLineNumber(); + blockData[offset++] = block.getLastLineNumber(); + blockData[offset++] = block.getStartUnit(); + blockData[offset] = block.getEndUnit(); size++; } diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/internal/pmd/PmdBlockChunker.java b/sonar-duplications/src/main/java/org/sonar/duplications/internal/pmd/PmdBlockChunker.java new file mode 100644 index 00000000000..58713b204f5 --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/internal/pmd/PmdBlockChunker.java @@ -0,0 +1,80 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2012 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.internal.pmd; + +import com.google.common.collect.Lists; +import org.sonar.duplications.block.Block; +import org.sonar.duplications.block.BlockChunker; +import org.sonar.duplications.block.ByteArray; + +import java.util.Collections; +import java.util.List; + +/** + * Differences with {@link BlockChunker}: + * works with {@link TokensLine}, + * sets {@link Block#setStartUnit(int)} and {@link Block#setEndUnit(int)} - indexes of first and last token for this block. + */ +public class PmdBlockChunker { + + private static final long PRIME_BASE = 31; + + private final int blockSize; + private final long power; + + public PmdBlockChunker(int blockSize) { + this.blockSize = blockSize; + + long pow = 1; + for (int i = 0; i < blockSize - 1; i++) { + pow = pow * PRIME_BASE; + } + this.power = pow; + } + + public List chunk(String resourceId, List fragments) { + if (fragments.size() < blockSize) { + return Collections.emptyList(); + } + TokensLine[] fragmentsArr = fragments.toArray(new TokensLine[fragments.size()]); + List blocks = Lists.newArrayListWithCapacity(fragmentsArr.length - blockSize + 1); + long hash = 0; + int first = 0; + int last = 0; + for (; last < blockSize - 1; last++) { + hash = hash * PRIME_BASE + fragmentsArr[last].getHashCode(); + } + for (; last < fragmentsArr.length; last++, first++) { + TokensLine firstFragment = fragmentsArr[first]; + TokensLine lastFragment = fragmentsArr[last]; + // add last statement to hash + hash = hash * PRIME_BASE + lastFragment.getHashCode(); + // create block + Block block = new Block(resourceId, new ByteArray(hash), first, firstFragment.getStartLine(), lastFragment.getEndLine()); + block.setStartUnit(firstFragment.getStartUnit()); + block.setEndUnit(lastFragment.getEndUnit()); + blocks.add(block); + // remove first statement from hash + hash -= power * firstFragment.getHashCode(); + } + return blocks; + } + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/internal/pmd/TokenizerBridge.java b/sonar-duplications/src/main/java/org/sonar/duplications/internal/pmd/TokenizerBridge.java index 777b4d53d48..57e84688d76 100644 --- a/sonar-duplications/src/main/java/org/sonar/duplications/internal/pmd/TokenizerBridge.java +++ b/sonar-duplications/src/main/java/org/sonar/duplications/internal/pmd/TokenizerBridge.java @@ -25,36 +25,45 @@ import net.sourceforge.pmd.cpd.SourceCode; import net.sourceforge.pmd.cpd.TokenEntry; import net.sourceforge.pmd.cpd.Tokenizer; import net.sourceforge.pmd.cpd.Tokens; +import org.sonar.duplications.block.Block; import org.sonar.duplications.cpd.FileCodeLoaderWithoutCache; -import org.sonar.duplications.statement.Statement; import java.io.File; import java.io.IOException; import java.util.List; /** - * Bridge, which allows to convert list of {@link TokenEntry} produced by {@link Tokenizer} into list of {@link Statement}s. - * Principle of conversion - statement formed from tokens of one line. + * Bridge, which allows to convert list of {@link TokenEntry} produced by {@link Tokenizer} into list of {@link TokensLine}s. */ public class TokenizerBridge { + private static final int BLOCK_SIZE = 10; + private final Tokenizer tokenizer; private final String encoding; + private final PmdBlockChunker blockBuilder; public TokenizerBridge(Tokenizer tokenizer, String encoding) { this.tokenizer = tokenizer; this.encoding = encoding; - clearCache(); + this.blockBuilder = new PmdBlockChunker(BLOCK_SIZE); + } + + // TODO remove from here + public List chunk(String resourceId, File file) { + return blockBuilder.chunk(resourceId, chunk(file)); } - public List tokenize(File file) { + public List chunk(File file) { SourceCode sourceCode = new SourceCode(new FileCodeLoaderWithoutCache(file, encoding)); Tokens tokens = new Tokens(); + TokenEntry.clearImages(); try { tokenizer.tokenize(sourceCode, tokens); } catch (IOException e) { throw Throwables.propagate(e); } + TokenEntry.clearImages(); return convert(tokens.getTokens()); } @@ -62,34 +71,34 @@ public class TokenizerBridge { * We expect that implementation of {@link Tokenizer} is correct: * tokens ordered by occurrence in source code and last token is EOF. */ - private static List convert(List tokens) { - ImmutableList.Builder result = ImmutableList.builder(); - int currentLine = Integer.MIN_VALUE; + private static List convert(List tokens) { + ImmutableList.Builder result = ImmutableList.builder(); StringBuilder sb = new StringBuilder(); + int startLine = Integer.MIN_VALUE; + int startIndex = 0; + int currentIndex = 0; for (TokenEntry token : tokens) { if (token != TokenEntry.EOF) { String value = token.getValue(); int line = token.getBeginLine(); - if (line != currentLine) { - addNewStatement(result, currentLine, sb); - currentLine = line; + if (line != startLine) { + addNewTokensLine(result, startIndex, currentIndex, startLine, sb); + startIndex = currentIndex + 1; + startLine = line; } + currentIndex++; sb.append(value); } } - addNewStatement(result, currentLine, sb); + addNewTokensLine(result, startIndex, currentIndex, startLine, sb); return result.build(); } - private static void addNewStatement(ImmutableList.Builder result, int line, StringBuilder sb) { + private static void addNewTokensLine(ImmutableList.Builder result, int startUnit, int endUnit, int startLine, StringBuilder sb) { if (sb.length() != 0) { - result.add(new Statement(line, line, sb.toString())); + result.add(new TokensLine(startUnit, endUnit, startLine, sb.toString().hashCode())); sb.setLength(0); } } - public void clearCache() { - TokenEntry.clearImages(); - } - } diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/internal/pmd/TokensLine.java b/sonar-duplications/src/main/java/org/sonar/duplications/internal/pmd/TokensLine.java new file mode 100644 index 00000000000..2f832bb50af --- /dev/null +++ b/sonar-duplications/src/main/java/org/sonar/duplications/internal/pmd/TokensLine.java @@ -0,0 +1,69 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2012 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.internal.pmd; + +import com.google.common.base.Preconditions; +import org.sonar.duplications.CodeFragment; + +/** + * Immutable code fragment, which formed from tokens of one line. + */ +class TokensLine implements CodeFragment { + + private final int startLine; + private final int hashCode; + + private final int startUnit; + private final int endUnit; + + public TokensLine(int startUnit, int endUnit, int startLine, int hashCode) { + Preconditions.checkArgument(startLine > 0); + // TODO do we have requirements for length and hashcode ? + this.startLine = startLine; + this.hashCode = hashCode; + + this.startUnit = startUnit; + this.endUnit = endUnit; + } + + public int getStartLine() { + return startLine; + } + + /** + * Same as {@link #getStartLine()} + */ + public int getEndLine() { + return startLine; + } + + public int getHashCode() { + return hashCode; + } + + public int getStartUnit() { + return startUnit; + } + + public int getEndUnit() { + return endUnit; + } + +} diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/statement/Statement.java b/sonar-duplications/src/main/java/org/sonar/duplications/statement/Statement.java index a154792dfb5..7f8a9c473b5 100644 --- a/sonar-duplications/src/main/java/org/sonar/duplications/statement/Statement.java +++ b/sonar-duplications/src/main/java/org/sonar/duplications/statement/Statement.java @@ -19,11 +19,12 @@ */ package org.sonar.duplications.statement; -import java.util.List; - +import org.sonar.duplications.CodeFragment; import org.sonar.duplications.token.Token; -public class Statement { +import java.util.List; + +public class Statement implements CodeFragment { private final int startLine; private final int endLine; @@ -84,8 +85,8 @@ public class Statement { } Statement other = (Statement) obj; return startLine == other.startLine - && endLine == other.endLine - && value.equals(other.value); + && endLine == other.endLine + && value.equals(other.value); } @Override @@ -93,4 +94,4 @@ public class Statement { return "[" + getStartLine() + "-" + getEndLine() + "] [" + getValue() + "]"; } -} \ No newline at end of file +} diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/cpd/CPDTest.java b/sonar-duplications/src/test/java/org/sonar/duplications/cpd/CPDTest.java index d825084dacf..c7133c4387a 100644 --- a/sonar-duplications/src/test/java/org/sonar/duplications/cpd/CPDTest.java +++ b/sonar-duplications/src/test/java/org/sonar/duplications/cpd/CPDTest.java @@ -57,8 +57,9 @@ public class CPDTest { assertThat(match.getLineCount(), is(26)); assertThat(match.getFirstMark().getBeginLine(), is(16)); assertThat(match.getSourceCodeSlice(), is(nullValue())); + assertThat(match.getTokenCount(), is(116)); } - + @Test public void testDuplicationOnSameFile() throws IOException { TokenEntry.clearImages(); @@ -77,6 +78,7 @@ public class CPDTest { assertThat(match.getLineCount(), is(16)); assertThat(match.getFirstMark().getBeginLine(), is(29)); assertThat(match.getSourceCodeSlice(), is(nullValue())); + assertThat(match.getTokenCount(), is(160)); } private List getMatches(CPD cpd) { diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/internal/pmd/PmdBlockChunkerTest.java b/sonar-duplications/src/test/java/org/sonar/duplications/internal/pmd/PmdBlockChunkerTest.java new file mode 100644 index 00000000000..50b17be524c --- /dev/null +++ b/sonar-duplications/src/test/java/org/sonar/duplications/internal/pmd/PmdBlockChunkerTest.java @@ -0,0 +1,56 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2012 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.internal.pmd; + +import org.junit.Test; +import org.sonar.duplications.block.Block; +import org.sonar.duplications.block.ByteArray; + +import java.util.Arrays; +import java.util.List; + +import static org.hamcrest.Matchers.is; +import static org.junit.Assert.assertThat; + +public class PmdBlockChunkerTest { + + @Test + public void shouldBuildBlocks() { + TokensLine line1 = new TokensLine(0, 9, 1, 1); + TokensLine line2 = new TokensLine(10, 19, 2, 2); + TokensLine line3 = new TokensLine(20, 29, 3, 3); + + List blocks = new PmdBlockChunker(2).chunk("resourceId", Arrays.asList(line1, line2, line3)); + assertThat(blocks.size(), is(2)); + + Block block = blocks.get(0); + // assertThat(block.getLengthInUnits(), is(11)); + assertThat(block.getStartLine(), is(1)); + assertThat(block.getEndLine(), is(2)); + assertThat(block.getBlockHash(), is(new ByteArray(1L * 31 + 2))); + + block = blocks.get(1); + // assertThat(block.getLengthInUnits(), is(33)); + assertThat(block.getStartLine(), is(2)); + assertThat(block.getEndLine(), is(3)); + assertThat(block.getBlockHash(), is(new ByteArray(2L * 31 + 3))); + } + +} diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/internal/pmd/PmdBridgeTest.java b/sonar-duplications/src/test/java/org/sonar/duplications/internal/pmd/PmdBridgeTest.java new file mode 100644 index 00000000000..60c22a51f85 --- /dev/null +++ b/sonar-duplications/src/test/java/org/sonar/duplications/internal/pmd/PmdBridgeTest.java @@ -0,0 +1,102 @@ +/* + * Sonar, open source software quality management tool. + * Copyright (C) 2008-2012 SonarSource + * mailto:contact AT sonarsource DOT com + * + * Sonar is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * Sonar is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Sonar; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02 + */ +package org.sonar.duplications.internal.pmd; + +import net.sourceforge.pmd.cpd.JavaTokenizer; +import org.junit.Before; +import org.junit.Test; +import org.sonar.duplications.block.Block; +import org.sonar.duplications.detector.suffixtree.SuffixTreeCloneDetectionAlgorithm; +import org.sonar.duplications.index.CloneGroup; +import org.sonar.duplications.index.CloneIndex; +import org.sonar.duplications.index.ClonePart; +import org.sonar.duplications.index.PackedMemoryCloneIndex; + +import java.io.File; +import java.util.Collection; +import java.util.List; + +import static org.hamcrest.Matchers.is; +import static org.junit.Assert.assertThat; + +public class PmdBridgeTest { + + private CloneIndex index; + private TokenizerBridge bridge; + + @Before + public void setUp() { + index = new PackedMemoryCloneIndex(); + bridge = new TokenizerBridge(new JavaTokenizer(), "UTF-8"); + } + + @Test + public void testDuplicationInSingleFile() { + File file = new File("test-resources/org/sonar/duplications/cpd/CPDTest/CPDFile3.java"); + addToIndex(file); + + List duplications = detect(file); + assertThat(duplications.size(), is(1)); + + CloneGroup duplication = duplications.get(0); + assertThat(duplication.getOriginPart().getResourceId(), is(file.getAbsolutePath())); + assertThat(duplication.getCloneParts().size(), is(2)); + assertThat("length in tokens", duplication.getLengthInUnits(), is(157)); + + ClonePart part = duplication.getCloneParts().get(0); + assertThat(part.getResourceId(), is(file.getAbsolutePath())); + assertThat(part.getStartLine(), is(30)); + assertThat(part.getEndLine(), is(44)); + } + + @Test + public void testDuplicationBetweenTwoFiles() { + File file1 = new File("test-resources/org/sonar/duplications/cpd/CPDTest/CPDFile1.java"); + File file2 = new File("test-resources/org/sonar/duplications/cpd/CPDTest/CPDFile2.java"); + addToIndex(file1); + addToIndex(file2); + + List duplications = detect(file1); + assertThat(duplications.size(), is(1)); + + CloneGroup duplication = duplications.get(0); + assertThat(duplication.getOriginPart().getResourceId(), is(file1.getAbsolutePath())); + assertThat(duplication.getCloneParts().size(), is(2)); + assertThat("length in tokens", duplication.getLengthInUnits(), is(115)); + + ClonePart part = duplication.getCloneParts().get(0); + assertThat(part.getResourceId(), is(file1.getAbsolutePath())); + assertThat(part.getStartLine(), is(18)); + assertThat(part.getEndLine(), is(41)); + } + + private List detect(File file) { + Collection fileBlocks = index.getByResourceId(file.getAbsolutePath()); + return SuffixTreeCloneDetectionAlgorithm.detect(index, fileBlocks); + } + + private void addToIndex(File file) { + List blocks = bridge.chunk(file.getAbsolutePath(), file); + for (Block block : blocks) { + index.insert(block); + } + } + +} diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/internal/pmd/TokenizerBridgeTest.java b/sonar-duplications/src/test/java/org/sonar/duplications/internal/pmd/TokenizerBridgeTest.java index 0d56c5f2da3..ec2bd5d0d9b 100644 --- a/sonar-duplications/src/test/java/org/sonar/duplications/internal/pmd/TokenizerBridgeTest.java +++ b/sonar-duplications/src/test/java/org/sonar/duplications/internal/pmd/TokenizerBridgeTest.java @@ -23,8 +23,8 @@ import net.sourceforge.pmd.cpd.SourceCode; import net.sourceforge.pmd.cpd.TokenEntry; import net.sourceforge.pmd.cpd.Tokenizer; import net.sourceforge.pmd.cpd.Tokens; +import org.junit.Before; import org.junit.Test; -import org.sonar.duplications.statement.Statement; import java.io.IOException; import java.util.List; @@ -34,8 +34,10 @@ import static org.junit.Assert.assertThat; public class TokenizerBridgeTest { - @Test - public void test() { + private TokenizerBridge bridge; + + @Before + public void setUp() { Tokenizer tokenizer = new Tokenizer() { public void tokenize(SourceCode tokens, Tokens tokenEntries) throws IOException { tokenEntries.add(new TokenEntry("t1", "src", 1)); @@ -43,30 +45,55 @@ public class TokenizerBridgeTest { tokenEntries.add(new TokenEntry("t3", "src", 2)); tokenEntries.add(new TokenEntry("t1", "src", 4)); tokenEntries.add(new TokenEntry("t3", "src", 4)); + tokenEntries.add(new TokenEntry("t3", "src", 4)); tokenEntries.add(TokenEntry.getEOF()); } }; + bridge = new TokenizerBridge(tokenizer, "UTF-8"); + } + + @Test + public void shouldClearCacheInTokenEntry() { + bridge.chunk(null); + TokenEntry token = new TokenEntry("image", "srcId", 0); + assertThat(token.getIndex(), is(0)); + assertThat(token.getIdentifier(), is(1)); + } + + @Test + public void test() { + // To be sure that token index will be relative to file - run twice: + bridge.chunk(null); + List lines = bridge.chunk(null); + + assertThat(lines.size(), is(3)); + + TokensLine line = lines.get(0); + // 2 tokens on 1 line + assertThat(line.getStartUnit(), is(1)); + assertThat(line.getEndUnit(), is(2)); - TokenizerBridge bridge = new TokenizerBridge(tokenizer, "UTF-8"); - List statements = bridge.tokenize(null); - bridge.clearCache(); + assertThat(line.getStartLine(), is(1)); + assertThat(line.getEndLine(), is(1)); + assertThat(line.getHashCode(), is("t1t2".hashCode())); - assertThat(statements.size(), is(3)); + line = lines.get(1); + // 1 token on 2 line + assertThat(line.getStartUnit(), is(3)); + assertThat(line.getEndUnit(), is(3)); - Statement statement = statements.get(0); - assertThat(statement.getStartLine(), is(1)); - assertThat(statement.getEndLine(), is(1)); - assertThat(statement.getValue(), is("t1t2")); + assertThat(line.getStartLine(), is(2)); + assertThat(line.getEndLine(), is(2)); + assertThat(line.getHashCode(), is("t3".hashCode())); - statement = statements.get(1); - assertThat(statement.getStartLine(), is(2)); - assertThat(statement.getEndLine(), is(2)); - assertThat(statement.getValue(), is("t3")); + line = lines.get(2); + // 3 tokens on 4 line + assertThat(line.getStartUnit(), is(4)); + assertThat(line.getEndUnit(), is(6)); - statement = statements.get(2); - assertThat(statement.getStartLine(), is(4)); - assertThat(statement.getEndLine(), is(4)); - assertThat(statement.getValue(), is("t1t3")); + assertThat(line.getStartLine(), is(4)); + assertThat(line.getEndLine(), is(4)); + assertThat(line.getHashCode(), is("t1t3t3".hashCode())); } } -- 2.39.5