SONAR-3181,SONAR-3139 Allow filtering by number of tokens

To do this each block stores two additional integers - index for first and last token. So total size of in-memory storage has been increased.
author: Evgeny Mandrikov <mandrikov@gmail.com> 2012-01-25 19:07:33 +0400
committer: Evgeny Mandrikov <mandrikov@gmail.com> 2012-01-26 13:58:50 +0400
commit: 90a6c00024ca6fc4476c6ecf7ab30e44e397240f (patch)
tree: ab3476c8b19b4d0bdce94c7119a230c56ef9cebf /sonar-duplications
parent: f185818faa11af412a75e33bf844bd3f21929e03 (diff)
download: sonarqube-90a6c00024ca6fc4476c6ecf7ab30e44e397240f.tar.gz
sonarqube-90a6c00024ca6fc4476c6ecf7ab30e44e397240f.zip
15 files changed, 637 insertions, 116 deletions
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/CodeFragment.java b/sonar-duplications/src/main/java/org/sonar/duplications/CodeFragment.java
new file mode 100644
index 00000000000..e908f3a5060
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/CodeFragment.java
@@ -0,0 +1,46 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2012 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02
+ */
+package org.sonar.duplications;
+
+import com.google.common.annotations.Beta;
+
+/**
+ * TODO Enforce contracts of this interface in concrete classes by using preconditions, currently this leads to failures of tests.
+ *
+ * <p>This interface is not intended to be implemented by clients.</p>
+ *
+ * @since 2.14
+ */
+@Beta
+public interface CodeFragment {
+
+  /**
+   * Number of line where fragment starts.
+   * Numbering starts from 1.
+   */
+  int getStartLine();
+
+  /**
+   * Number of line where fragment ends.
+   * Numbering starts from 1.
+   */
+  int getEndLine();
+
+}
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/block/Block.java b/sonar-duplications/src/main/java/org/sonar/duplications/block/Block.java
index 7a1f659b55d..fb98dd10732 100644
--- a/sonar-duplications/src/main/java/org/sonar/duplications/block/Block.java
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/block/Block.java
@@ -19,11 +19,14 @@
  */
 package org.sonar.duplications.block;
 
+import com.google.common.annotations.Beta;
+import org.sonar.duplications.CodeFragment;
+
 /**
  * Represents part of source code between two lines.
  * If two blocks have the same {@link #getBlockHash() hash}, then we assume that there is a duplication in a code, which they represent.
  */
-public final class Block {
+public final class Block implements CodeFragment {
 
   private final String resourceId;
   private final ByteArray blockHash;
@@ -64,14 +67,69 @@ public final class Block {
     return indexInFile;
   }
 
+  private int startUnit;
+  private int endUnit;
+
+  /**
+   * @since 2.14
+   */
+  @Beta
+  public int getStartUnit() {
+    return startUnit;
+  }
+
+  /**
+   * @since 2.14
+   */
+  @Beta
+  public int getEndUnit() {
+    return endUnit;
+  }
+
+  /**
+   * TODO get rid of this method, otherwise class is not immutable
+   *
+   * @see #getStartUnit()
+   * @since 2.14
+   */
+  @Beta
+  public void setStartUnit(int startUnit) {
+    this.startUnit = startUnit;
+  }
+
+  /**
+   * TODO get rid of this method, otherwise class is not immutable
+   *
+   * @see #getEndUnit()
+   * @since 2.14
+   */
+  @Beta
+  public void setEndUnit(int endUnit) {
+    this.endUnit = endUnit;
+  }
+
+  /**
+   * @deprecated in 2.14, use {@link #getStartLine()} instead
+   */
   public int getFirstLineNumber() {
     return firstLineNumber;
   }
 
+  /**
+   * @deprecated in 2.14, use {@link #getEndLine()} instead
+   */
   public int getLastLineNumber() {
     return lastLineNumber;
   }
 
+  public int getStartLine() {
+    return firstLineNumber;
+  }
+
+  public int getEndLine() {
+    return lastLineNumber;
+  }
+
   @Override
   public boolean equals(Object obj) {
     if (!(obj instanceof Block)) {
@@ -79,10 +137,10 @@ public final class Block {
     }
     Block other = (Block) obj;
     return resourceId.equals(other.resourceId)
-        && blockHash.equals(other.blockHash)
-        && indexInFile == other.indexInFile
-        && firstLineNumber == other.firstLineNumber
-        && lastLineNumber == other.lastLineNumber;
+      && blockHash.equals(other.blockHash)
+      && indexInFile == other.indexInFile
+      && firstLineNumber == other.firstLineNumber
+      && lastLineNumber == other.lastLineNumber;
   }
 
   @Override
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/block/BlockChunker.java b/sonar-duplications/src/main/java/org/sonar/duplications/block/BlockChunker.java
index cbb8db112f9..3abc0742493 100644
--- a/sonar-duplications/src/main/java/org/sonar/duplications/block/BlockChunker.java
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/block/BlockChunker.java
@@ -19,12 +19,11 @@
  */
 package org.sonar.duplications.block;
 
-import java.util.Collections;
-import java.util.List;
-
+import com.google.common.collect.Lists;
 import org.sonar.duplications.statement.Statement;
 
-import com.google.common.collect.Lists;
+import java.util.Collections;
+import java.util.List;
 
 /**
  * Creates blocks from statements, each block will contain specified number of statements (<code>blockSize</code>) and 64-bits (8-bytes) hash value.
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java
index c72d6063ab2..1b9f5f8b164 100644
--- a/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/detector/suffixtree/DuplicationsCollector.java
@@ -19,16 +19,15 @@
  */
 package org.sonar.duplications.detector.suffixtree;
 
-import java.util.Collections;
-import java.util.List;
-
+import com.google.common.collect.Lists;
 import org.sonar.duplications.block.Block;
 import org.sonar.duplications.detector.ContainsInComparator;
 import org.sonar.duplications.index.CloneGroup;
 import org.sonar.duplications.index.ClonePart;
 import org.sonar.duplications.utils.SortedListsUtils;
 
-import com.google.common.collect.Lists;
+import java.util.Collections;
+import java.util.List;
 
 /**
  * Implementation of {@link Search.Collector}, which constructs {@link CloneGroup}s.
@@ -41,8 +40,8 @@ public class DuplicationsCollector extends Search.Collector {
   private final List<CloneGroup> filtered = Lists.newArrayList();
 
   private int length;
-  private ClonePart origin;
-  private List<ClonePart> parts;
+  private int count;
+  private int[][] blockNumbers;
 
   public DuplicationsCollector(TextSet text) {
     this.text = text;
@@ -58,38 +57,22 @@ public class DuplicationsCollector extends Search.Collector {
 
   @Override
   public void startOfGroup(int size, int length) {
+    this.blockNumbers = new int[size][2];
     this.length = length;
-    this.parts = Lists.newArrayListWithCapacity(size);
   }
 
   /**
    * Constructs ClonePart and saves it for future processing in {@link #endOfGroup()}.
-   * 
+   *
    * @param start number of first block from text for this part
    * @param end number of last block from text for this part
    * @param len number of blocks in this part
    */
   @Override
   public void part(int start, int end) {
-    Block firstBlock = text.getBlock(start);
-    Block lastBlock = text.getBlock(end - 1);
-
-    ClonePart part = new ClonePart(
-        firstBlock.getResourceId(),
-        firstBlock.getIndexInFile(),
-        firstBlock.getFirstLineNumber(),
-        lastBlock.getLastLineNumber());
-
-    // TODO Godin: maybe use FastStringComparator here ?
-    if (originResourceId.equals(part.getResourceId())) { // part from origin
-      if (origin == null) {
-        origin = part;
-      } else if (part.getUnitStart() < origin.getUnitStart()) {
-        origin = part;
-      }
-    }
-
-    parts.add(part);
+    blockNumbers[count][0] = start;
+    blockNumbers[count][1] = end - 1;
+    count++;
   }
 
   /**
@@ -97,12 +80,48 @@ public class DuplicationsCollector extends Search.Collector {
    */
   @Override
   public void endOfGroup() {
+    int lengthInUnits = 0;
+    ClonePart origin = null;
+    List<ClonePart> parts = Lists.newArrayListWithCapacity(count);
+    for (int[] b : blockNumbers) {
+      Block firstBlock = text.getBlock(b[0]);
+      Block lastBlock = text.getBlock(b[1]);
+      ClonePart part = new ClonePart(
+          firstBlock.getResourceId(),
+          firstBlock.getIndexInFile(),
+          firstBlock.getFirstLineNumber(),
+          lastBlock.getLastLineNumber());
+
+      // TODO Godin: maybe use FastStringComparator here ?
+      if (originResourceId.equals(part.getResourceId())) { // part from origin
+        if (origin == null) {
+          origin = part;
+          // To calculate length important to use the origin, because otherwise block may come from DB without required data
+          lengthInUnits = lastBlock.getEndUnit() - firstBlock.getStartUnit() + 1;
+        } else if (part.getUnitStart() < origin.getUnitStart()) {
+          origin = part;
+        }
+      }
+
+      parts.add(part);
+    }
+
     Collections.sort(parts, ContainsInComparator.CLONEPART_COMPARATOR);
+
     CloneGroup group = new CloneGroup(length, origin, parts);
+    group.setLengthInUnits(lengthInUnits);
+
     filter(group);
 
-    parts = null;
-    origin = null;
+    reset();
+  }
+
+  /**
+   * Prepare for processing of next duplication.
+   */
+  private void reset() {
+    blockNumbers = null;
+    count = 0;
   }
 
   /**
@@ -159,7 +178,7 @@ public class DuplicationsCollector extends Search.Collector {
     // TODO Godin: according to tests seems that if first part of condition is true, then second part can not be false
     // if this can be proved, then second part can be removed
     return SortedListsUtils.contains(secondParts, firstParts, new ContainsInComparator(second.getCloneUnitLength(), first.getCloneUnitLength()))
-        && SortedListsUtils.contains(firstParts, secondParts, ContainsInComparator.RESOURCE_ID_COMPARATOR);
+      && SortedListsUtils.contains(firstParts, secondParts, ContainsInComparator.RESOURCE_ID_COMPARATOR);
   }
 
 }
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/index/CloneGroup.java b/sonar-duplications/src/main/java/org/sonar/duplications/index/CloneGroup.java
index 8c662be34aa..dfb86bb93d6 100644
--- a/sonar-duplications/src/main/java/org/sonar/duplications/index/CloneGroup.java
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/index/CloneGroup.java
@@ -19,10 +19,11 @@
  */
 package org.sonar.duplications.index;
 
-import java.util.List;
-
+import com.google.common.annotations.Beta;
 import com.google.common.collect.ImmutableList;
 
+import java.util.List;
+
 /**
  * Groups a set of related {@link ClonePart}s.
  */
@@ -47,8 +48,31 @@ public class CloneGroup {
     return originPart;
   }
 
+  private int length;
+
+  /**
+   * Length of duplication measured in original units, e.g. for token-based detection - in tokens.
+   *
+   * @since 2.14
+   */
+  @Beta
+  public int getLengthInUnits() {
+    return length;
+  }
+
+  /**
+   * TODO get rid of this method, otherwise class is not immutable
+   *
+   * @since 2.14
+   * @see #getLengthInUnits()
+   */
+  @Beta
+  public void setLengthInUnits(int length) {
+    this.length = length;
+  }
+
   /**
-   * @return clone length in units (not in lines)
+   * @return clone length in {@link org.sonar.duplications.block.Block}s
    */
   public int getCloneUnitLength() {
     return cloneLength;
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/index/ClonePart.java b/sonar-duplications/src/main/java/org/sonar/duplications/index/ClonePart.java
index 274b3708ef1..36648b19d25 100644
--- a/sonar-duplications/src/main/java/org/sonar/duplications/index/ClonePart.java
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/index/ClonePart.java
@@ -19,23 +19,25 @@
  */
 package org.sonar.duplications.index;
 
-public class ClonePart {
+import org.sonar.duplications.CodeFragment;
+
+public class ClonePart implements CodeFragment {
 
   private final String resourceId;
-  private final int unitStart;
-  private final int lineStart;
-  private final int lineEnd;
+  private final int startUnit;
+  private final int startLine;
+  private final int endLine;
 
   /**
    * Cache for hash code.
    */
   private int hash;
 
-  public ClonePart(String resourceId, int unitStart, int lineStart, int lineEnd) {
+  public ClonePart(String resourceId, int startUnit, int startLine, int endLine) {
     this.resourceId = resourceId;
-    this.unitStart = unitStart;
-    this.lineStart = lineStart;
-    this.lineEnd = lineEnd;
+    this.startUnit = startUnit;
+    this.startLine = startLine;
+    this.endLine = endLine;
   }
 
   public String getResourceId() {
@@ -43,19 +45,35 @@ public class ClonePart {
   }
 
   public int getUnitStart() {
-    return unitStart;
+    return startUnit;
   }
 
+  /**
+   * @deprecated in 2.14, use {@link #getStartLine()} instead
+   */
+  @Deprecated
   public int getLineStart() {
-    return lineStart;
+    return startLine;
   }
 
+  /**
+   * @deprecated in 2.14, use {@link #getEndLine()} instead
+   */
+  @Deprecated
   public int getLineEnd() {
-    return lineEnd;
+    return endLine;
+  }
+
+  public int getStartLine() {
+    return startLine;
+  }
+
+  public int getEndLine() {
+    return endLine;
   }
 
   public int getLines() {
-    return lineEnd - lineStart + 1;
+    return endLine - startLine + 1;
   }
 
   @Override
@@ -63,9 +81,9 @@ public class ClonePart {
     if (obj instanceof ClonePart) {
       ClonePart another = (ClonePart) obj;
       return another.resourceId.equals(resourceId)
-          && another.lineStart == lineStart
-          && another.lineEnd == lineEnd
-          && another.unitStart == unitStart;
+        && another.startLine == startLine
+        && another.endLine == endLine
+        && another.startUnit == startUnit;
     }
     return false;
   }
@@ -75,9 +93,9 @@ public class ClonePart {
     int h = hash;
     if (h == 0) {
       h = resourceId.hashCode();
-      h = 31 * h + lineStart;
-      h = 31 * h + lineEnd;
-      h = 31 * h + unitStart;
+      h = 31 * h + startLine;
+      h = 31 * h + endLine;
+      h = 31 * h + startUnit;
       hash = h;
     }
     return h;
@@ -85,7 +103,7 @@ public class ClonePart {
 
   @Override
   public String toString() {
-    return "'" + resourceId + "':[" + unitStart + "|" + lineStart + "-" + lineEnd + "]";
+    return "'" + resourceId + "':[" + startUnit + "|" + startLine + "-" + endLine + "]";
   }
 
 }
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/index/PackedMemoryCloneIndex.java b/sonar-duplications/src/main/java/org/sonar/duplications/index/PackedMemoryCloneIndex.java
index 4223c8e95a7..cb721a0cc78 100644
--- a/sonar-duplications/src/main/java/org/sonar/duplications/index/PackedMemoryCloneIndex.java
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/index/PackedMemoryCloneIndex.java
@@ -19,14 +19,13 @@
  */
 package org.sonar.duplications.index;
 
-import java.util.Collection;
-import java.util.List;
-
+import com.google.common.collect.Lists;
 import org.sonar.duplications.block.Block;
 import org.sonar.duplications.block.ByteArray;
 import org.sonar.duplications.utils.FastStringComparator;
 
-import com.google.common.collect.Lists;
+import java.util.Collection;
+import java.util.List;
 
 /**
  * Provides an index optimized by memory.
@@ -46,7 +45,7 @@ public class PackedMemoryCloneIndex extends AbstractCloneIndex {
 
   private static final int DEFAULT_INITIAL_CAPACITY = 1024;
 
-  private static final int BLOCK_INTS = 3;
+  private static final int BLOCK_INTS = 5;
 
   private final int hashInts;
 
@@ -111,9 +110,14 @@ public class PackedMemoryCloneIndex extends AbstractCloneIndex {
       }
       int indexInFile = blockData[offset++];
       int firstLineNumber = blockData[offset++];
-      int lastLineNumber = blockData[offset];
+      int lastLineNumber = blockData[offset++];
+      int startUnit = blockData[offset++];
+      int endUnit = blockData[offset];
 
-      result.add(new Block(resourceId, new ByteArray(hash), indexInFile, firstLineNumber, lastLineNumber));
+      Block block = new Block(resourceId, new ByteArray(hash), indexInFile, firstLineNumber, lastLineNumber);
+      block.setStartUnit(startUnit);
+      block.setEndUnit(endUnit);
+      result.add(block);
 
       index++;
       realIndex = resourceIdsIndex[index];
@@ -146,9 +150,14 @@ public class PackedMemoryCloneIndex extends AbstractCloneIndex {
       offset = index * blockInts + hashInts;
       int indexInFile = blockData[offset++];
       int firstLineNumber = blockData[offset++];
-      int lastLineNumber = blockData[offset];
-
-      result.add(new Block(resourceId, sequenceHash, indexInFile, firstLineNumber, lastLineNumber));
+      int lastLineNumber = blockData[offset++];
+      int startUnit = blockData[offset++];
+      int endUnit = blockData[offset];
+
+      Block block = new Block(resourceId, sequenceHash, indexInFile, firstLineNumber, lastLineNumber);
+      block.setStartUnit(startUnit);
+      block.setEndUnit(endUnit);
+      result.add(block);
       index++;
     }
     return result;
@@ -176,7 +185,9 @@ public class PackedMemoryCloneIndex extends AbstractCloneIndex {
     }
     blockData[offset++] = block.getIndexInFile();
     blockData[offset++] = block.getFirstLineNumber();
-    blockData[offset] = block.getLastLineNumber();
+    blockData[offset++] = block.getLastLineNumber();
+    blockData[offset++] = block.getStartUnit();
+    blockData[offset] = block.getEndUnit();
 
     size++;
   }
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/internal/pmd/PmdBlockChunker.java b/sonar-duplications/src/main/java/org/sonar/duplications/internal/pmd/PmdBlockChunker.java
new file mode 100644
index 00000000000..58713b204f5
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/internal/pmd/PmdBlockChunker.java
@@ -0,0 +1,80 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2012 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02
+ */
+package org.sonar.duplications.internal.pmd;
+
+import com.google.common.collect.Lists;
+import org.sonar.duplications.block.Block;
+import org.sonar.duplications.block.BlockChunker;
+import org.sonar.duplications.block.ByteArray;
+
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Differences with {@link BlockChunker}:
+ * works with {@link TokensLine},
+ * sets {@link Block#setStartUnit(int)} and {@link Block#setEndUnit(int)} - indexes of first and last token for this block.
+ */
+public class PmdBlockChunker {
+
+  private static final long PRIME_BASE = 31;
+
+  private final int blockSize;
+  private final long power;
+
+  public PmdBlockChunker(int blockSize) {
+    this.blockSize = blockSize;
+
+    long pow = 1;
+    for (int i = 0; i < blockSize - 1; i++) {
+      pow = pow * PRIME_BASE;
+    }
+    this.power = pow;
+  }
+
+  public List<Block> chunk(String resourceId, List<TokensLine> fragments) {
+    if (fragments.size() < blockSize) {
+      return Collections.emptyList();
+    }
+    TokensLine[] fragmentsArr = fragments.toArray(new TokensLine[fragments.size()]);
+    List<Block> blocks = Lists.newArrayListWithCapacity(fragmentsArr.length - blockSize + 1);
+    long hash = 0;
+    int first = 0;
+    int last = 0;
+    for (; last < blockSize - 1; last++) {
+      hash = hash * PRIME_BASE + fragmentsArr[last].getHashCode();
+    }
+    for (; last < fragmentsArr.length; last++, first++) {
+      TokensLine firstFragment = fragmentsArr[first];
+      TokensLine lastFragment = fragmentsArr[last];
+      // add last statement to hash
+      hash = hash * PRIME_BASE + lastFragment.getHashCode();
+      // create block
+      Block block = new Block(resourceId, new ByteArray(hash), first, firstFragment.getStartLine(), lastFragment.getEndLine());
+      block.setStartUnit(firstFragment.getStartUnit());
+      block.setEndUnit(lastFragment.getEndUnit());
+      blocks.add(block);
+      // remove first statement from hash
+      hash -= power * firstFragment.getHashCode();
+    }
+    return blocks;
+  }
+
+}
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/internal/pmd/TokenizerBridge.java b/sonar-duplications/src/main/java/org/sonar/duplications/internal/pmd/TokenizerBridge.java
index 777b4d53d48..57e84688d76 100644
--- a/sonar-duplications/src/main/java/org/sonar/duplications/internal/pmd/TokenizerBridge.java
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/internal/pmd/TokenizerBridge.java
@@ -25,36 +25,45 @@ import net.sourceforge.pmd.cpd.SourceCode;
 import net.sourceforge.pmd.cpd.TokenEntry;
 import net.sourceforge.pmd.cpd.Tokenizer;
 import net.sourceforge.pmd.cpd.Tokens;
+import org.sonar.duplications.block.Block;
 import org.sonar.duplications.cpd.FileCodeLoaderWithoutCache;
-import org.sonar.duplications.statement.Statement;
 
 import java.io.File;
 import java.io.IOException;
 import java.util.List;
 
 /**
- * Bridge, which allows to convert list of {@link TokenEntry} produced by {@link Tokenizer} into list of {@link Statement}s.
- * Principle of conversion - statement formed from tokens of one line.
+ * Bridge, which allows to convert list of {@link TokenEntry} produced by {@link Tokenizer} into list of {@link TokensLine}s.
  */
 public class TokenizerBridge {
 
+  private static final int BLOCK_SIZE = 10;
+
   private final Tokenizer tokenizer;
   private final String encoding;
+  private final PmdBlockChunker blockBuilder;
 
   public TokenizerBridge(Tokenizer tokenizer, String encoding) {
     this.tokenizer = tokenizer;
     this.encoding = encoding;
-    clearCache();
+    this.blockBuilder = new PmdBlockChunker(BLOCK_SIZE);
+  }
+
+  // TODO remove from here
+  public List<Block> chunk(String resourceId, File file) {
+    return blockBuilder.chunk(resourceId, chunk(file));
   }
 
-  public List<Statement> tokenize(File file) {
+  public List<TokensLine> chunk(File file) {
     SourceCode sourceCode = new SourceCode(new FileCodeLoaderWithoutCache(file, encoding));
     Tokens tokens = new Tokens();
+    TokenEntry.clearImages();
     try {
       tokenizer.tokenize(sourceCode, tokens);
     } catch (IOException e) {
       throw Throwables.propagate(e);
     }
+    TokenEntry.clearImages();
     return convert(tokens.getTokens());
   }
 
@@ -62,34 +71,34 @@ public class TokenizerBridge {
    * We expect that implementation of {@link Tokenizer} is correct:
    * tokens ordered by occurrence in source code and last token is EOF.
    */
-  private static List<Statement> convert(List<TokenEntry> tokens) {
-    ImmutableList.Builder<Statement> result = ImmutableList.builder();
-    int currentLine = Integer.MIN_VALUE;
+  private static List<TokensLine> convert(List<TokenEntry> tokens) {
+    ImmutableList.Builder<TokensLine> result = ImmutableList.builder();
     StringBuilder sb = new StringBuilder();
+    int startLine = Integer.MIN_VALUE;
+    int startIndex = 0;
+    int currentIndex = 0;
     for (TokenEntry token : tokens) {
       if (token != TokenEntry.EOF) {
         String value = token.getValue();
         int line = token.getBeginLine();
-        if (line != currentLine) {
-          addNewStatement(result, currentLine, sb);
-          currentLine = line;
+        if (line != startLine) {
+          addNewTokensLine(result, startIndex, currentIndex, startLine, sb);
+          startIndex = currentIndex + 1;
+          startLine = line;
         }
+        currentIndex++;
         sb.append(value);
       }
     }
-    addNewStatement(result, currentLine, sb);
+    addNewTokensLine(result, startIndex, currentIndex, startLine, sb);
     return result.build();
   }
 
-  private static void addNewStatement(ImmutableList.Builder<Statement> result, int line, StringBuilder sb) {
+  private static void addNewTokensLine(ImmutableList.Builder<TokensLine> result, int startUnit, int endUnit, int startLine, StringBuilder sb) {
     if (sb.length() != 0) {
-      result.add(new Statement(line, line, sb.toString()));
+      result.add(new TokensLine(startUnit, endUnit, startLine, sb.toString().hashCode()));
       sb.setLength(0);
     }
   }
 
-  public void clearCache() {
-    TokenEntry.clearImages();
-  }
-
 }
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/internal/pmd/TokensLine.java b/sonar-duplications/src/main/java/org/sonar/duplications/internal/pmd/TokensLine.java
new file mode 100644
index 00000000000..2f832bb50af
--- /dev/null
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/internal/pmd/TokensLine.java
@@ -0,0 +1,69 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2012 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02
+ */
+package org.sonar.duplications.internal.pmd;
+
+import com.google.common.base.Preconditions;
+import org.sonar.duplications.CodeFragment;
+
+/**
+ * Immutable code fragment, which formed from tokens of one line.
+ */
+class TokensLine implements CodeFragment {
+
+  private final int startLine;
+  private final int hashCode;
+
+  private final int startUnit;
+  private final int endUnit;
+
+  public TokensLine(int startUnit, int endUnit, int startLine, int hashCode) {
+    Preconditions.checkArgument(startLine > 0);
+    // TODO do we have requirements for length and hashcode ?
+    this.startLine = startLine;
+    this.hashCode = hashCode;
+
+    this.startUnit = startUnit;
+    this.endUnit = endUnit;
+  }
+
+  public int getStartLine() {
+    return startLine;
+  }
+
+  /**
+   * Same as {@link #getStartLine()}
+   */
+  public int getEndLine() {
+    return startLine;
+  }
+
+  public int getHashCode() {
+    return hashCode;
+  }
+
+  public int getStartUnit() {
+    return startUnit;
+  }
+
+  public int getEndUnit() {
+    return endUnit;
+  }
+
+}
diff --git a/sonar-duplications/src/main/java/org/sonar/duplications/statement/Statement.java b/sonar-duplications/src/main/java/org/sonar/duplications/statement/Statement.java
index a154792dfb5..7f8a9c473b5 100644
--- a/sonar-duplications/src/main/java/org/sonar/duplications/statement/Statement.java
+++ b/sonar-duplications/src/main/java/org/sonar/duplications/statement/Statement.java
@@ -19,11 +19,12 @@
  */
 package org.sonar.duplications.statement;
 
-import java.util.List;
-
+import org.sonar.duplications.CodeFragment;
 import org.sonar.duplications.token.Token;
 
-public class Statement {
+import java.util.List;
+
+public class Statement implements CodeFragment {
 
   private final int startLine;
   private final int endLine;
@@ -84,8 +85,8 @@ public class Statement {
     }
     Statement other = (Statement) obj;
     return startLine == other.startLine
-        && endLine == other.endLine
-        && value.equals(other.value);
+      && endLine == other.endLine
+      && value.equals(other.value);
   }
 
   @Override
@@ -93,4 +94,4 @@ public class Statement {
     return "[" + getStartLine() + "-" + getEndLine() + "] [" + getValue() + "]";
   }
 
-}
-\ No newline at end of file
+}
diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/cpd/CPDTest.java b/sonar-duplications/src/test/java/org/sonar/duplications/cpd/CPDTest.java
index d825084dacf..c7133c4387a 100644
--- a/sonar-duplications/src/test/java/org/sonar/duplications/cpd/CPDTest.java
+++ b/sonar-duplications/src/test/java/org/sonar/duplications/cpd/CPDTest.java
@@ -57,8 +57,9 @@ public class CPDTest {
     assertThat(match.getLineCount(), is(26));
     assertThat(match.getFirstMark().getBeginLine(), is(16));
     assertThat(match.getSourceCodeSlice(), is(nullValue()));
+    assertThat(match.getTokenCount(), is(116));
   }
-  
+
   @Test
   public void testDuplicationOnSameFile() throws IOException {
     TokenEntry.clearImages();
@@ -77,6 +78,7 @@ public class CPDTest {
     assertThat(match.getLineCount(), is(16));
     assertThat(match.getFirstMark().getBeginLine(), is(29));
     assertThat(match.getSourceCodeSlice(), is(nullValue()));
+    assertThat(match.getTokenCount(), is(160));
   }
 
   private List<Match> getMatches(CPD cpd) {
diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/internal/pmd/PmdBlockChunkerTest.java b/sonar-duplications/src/test/java/org/sonar/duplications/internal/pmd/PmdBlockChunkerTest.java
new file mode 100644
index 00000000000..50b17be524c
--- /dev/null
+++ b/sonar-duplications/src/test/java/org/sonar/duplications/internal/pmd/PmdBlockChunkerTest.java
@@ -0,0 +1,56 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2012 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02
+ */
+package org.sonar.duplications.internal.pmd;
+
+import org.junit.Test;
+import org.sonar.duplications.block.Block;
+import org.sonar.duplications.block.ByteArray;
+
+import java.util.Arrays;
+import java.util.List;
+
+import static org.hamcrest.Matchers.is;
+import static org.junit.Assert.assertThat;
+
+public class PmdBlockChunkerTest {
+
+  @Test
+  public void shouldBuildBlocks() {
+    TokensLine line1 = new TokensLine(0, 9, 1, 1);
+    TokensLine line2 = new TokensLine(10, 19, 2, 2);
+    TokensLine line3 = new TokensLine(20, 29, 3, 3);
+
+    List<Block> blocks = new PmdBlockChunker(2).chunk("resourceId", Arrays.asList(line1, line2, line3));
+    assertThat(blocks.size(), is(2));
+
+    Block block = blocks.get(0);
+    // assertThat(block.getLengthInUnits(), is(11));
+    assertThat(block.getStartLine(), is(1));
+    assertThat(block.getEndLine(), is(2));
+    assertThat(block.getBlockHash(), is(new ByteArray(1L * 31 + 2)));
+
+    block = blocks.get(1);
+    // assertThat(block.getLengthInUnits(), is(33));
+    assertThat(block.getStartLine(), is(2));
+    assertThat(block.getEndLine(), is(3));
+    assertThat(block.getBlockHash(), is(new ByteArray(2L * 31 + 3)));
+  }
+
+}
diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/internal/pmd/PmdBridgeTest.java b/sonar-duplications/src/test/java/org/sonar/duplications/internal/pmd/PmdBridgeTest.java
new file mode 100644
index 00000000000..60c22a51f85
--- /dev/null
+++ b/sonar-duplications/src/test/java/org/sonar/duplications/internal/pmd/PmdBridgeTest.java
@@ -0,0 +1,102 @@
+/*
+ * Sonar, open source software quality management tool.
+ * Copyright (C) 2008-2012 SonarSource
+ * mailto:contact AT sonarsource DOT com
+ *
+ * Sonar is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * Sonar is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Sonar; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02
+ */
+package org.sonar.duplications.internal.pmd;
+
+import net.sourceforge.pmd.cpd.JavaTokenizer;
+import org.junit.Before;
+import org.junit.Test;
+import org.sonar.duplications.block.Block;
+import org.sonar.duplications.detector.suffixtree.SuffixTreeCloneDetectionAlgorithm;
+import org.sonar.duplications.index.CloneGroup;
+import org.sonar.duplications.index.CloneIndex;
+import org.sonar.duplications.index.ClonePart;
+import org.sonar.duplications.index.PackedMemoryCloneIndex;
+
+import java.io.File;
+import java.util.Collection;
+import java.util.List;
+
+import static org.hamcrest.Matchers.is;
+import static org.junit.Assert.assertThat;
+
+public class PmdBridgeTest {
+
+  private CloneIndex index;
+  private TokenizerBridge bridge;
+
+  @Before
+  public void setUp() {
+    index = new PackedMemoryCloneIndex();
+    bridge = new TokenizerBridge(new JavaTokenizer(), "UTF-8");
+  }
+
+  @Test
+  public void testDuplicationInSingleFile() {
+    File file = new File("test-resources/org/sonar/duplications/cpd/CPDTest/CPDFile3.java");
+    addToIndex(file);
+
+    List<CloneGroup> duplications = detect(file);
+    assertThat(duplications.size(), is(1));
+
+    CloneGroup duplication = duplications.get(0);
+    assertThat(duplication.getOriginPart().getResourceId(), is(file.getAbsolutePath()));
+    assertThat(duplication.getCloneParts().size(), is(2));
+    assertThat("length in tokens", duplication.getLengthInUnits(), is(157));
+
+    ClonePart part = duplication.getCloneParts().get(0);
+    assertThat(part.getResourceId(), is(file.getAbsolutePath()));
+    assertThat(part.getStartLine(), is(30));
+    assertThat(part.getEndLine(), is(44));
+  }
+
+  @Test
+  public void testDuplicationBetweenTwoFiles() {
+    File file1 = new File("test-resources/org/sonar/duplications/cpd/CPDTest/CPDFile1.java");
+    File file2 = new File("test-resources/org/sonar/duplications/cpd/CPDTest/CPDFile2.java");
+    addToIndex(file1);
+    addToIndex(file2);
+
+    List<CloneGroup> duplications = detect(file1);
+    assertThat(duplications.size(), is(1));
+
+    CloneGroup duplication = duplications.get(0);
+    assertThat(duplication.getOriginPart().getResourceId(), is(file1.getAbsolutePath()));
+    assertThat(duplication.getCloneParts().size(), is(2));
+    assertThat("length in tokens", duplication.getLengthInUnits(), is(115));
+
+    ClonePart part = duplication.getCloneParts().get(0);
+    assertThat(part.getResourceId(), is(file1.getAbsolutePath()));
+    assertThat(part.getStartLine(), is(18));
+    assertThat(part.getEndLine(), is(41));
+  }
+
+  private List<CloneGroup> detect(File file) {
+    Collection<Block> fileBlocks = index.getByResourceId(file.getAbsolutePath());
+    return SuffixTreeCloneDetectionAlgorithm.detect(index, fileBlocks);
+  }
+
+  private void addToIndex(File file) {
+    List<Block> blocks = bridge.chunk(file.getAbsolutePath(), file);
+    for (Block block : blocks) {
+      index.insert(block);
+    }
+  }
+
+}
diff --git a/sonar-duplications/src/test/java/org/sonar/duplications/internal/pmd/TokenizerBridgeTest.java b/sonar-duplications/src/test/java/org/sonar/duplications/internal/pmd/TokenizerBridgeTest.java
index 0d56c5f2da3..ec2bd5d0d9b 100644
--- a/sonar-duplications/src/test/java/org/sonar/duplications/internal/pmd/TokenizerBridgeTest.java
+++ b/sonar-duplications/src/test/java/org/sonar/duplications/internal/pmd/TokenizerBridgeTest.java
@@ -23,8 +23,8 @@ import net.sourceforge.pmd.cpd.SourceCode;
 import net.sourceforge.pmd.cpd.TokenEntry;
 import net.sourceforge.pmd.cpd.Tokenizer;
 import net.sourceforge.pmd.cpd.Tokens;
+import org.junit.Before;
 import org.junit.Test;
-import org.sonar.duplications.statement.Statement;
 
 import java.io.IOException;
 import java.util.List;
@@ -34,8 +34,10 @@ import static org.junit.Assert.assertThat;
 
 public class TokenizerBridgeTest {
 
-  @Test
-  public void test() {
+  private TokenizerBridge bridge;
+
+  @Before
+  public void setUp() {
     Tokenizer tokenizer = new Tokenizer() {
       public void tokenize(SourceCode tokens, Tokens tokenEntries) throws IOException {
         tokenEntries.add(new TokenEntry("t1", "src", 1));
@@ -43,30 +45,55 @@ public class TokenizerBridgeTest {
         tokenEntries.add(new TokenEntry("t3", "src", 2));
         tokenEntries.add(new TokenEntry("t1", "src", 4));
         tokenEntries.add(new TokenEntry("t3", "src", 4));
+        tokenEntries.add(new TokenEntry("t3", "src", 4));
         tokenEntries.add(TokenEntry.getEOF());
       }
     };
+    bridge = new TokenizerBridge(tokenizer, "UTF-8");
+  }
+
+  @Test
+  public void shouldClearCacheInTokenEntry() {
+    bridge.chunk(null);
+    TokenEntry token = new TokenEntry("image", "srcId", 0);
+    assertThat(token.getIndex(), is(0));
+    assertThat(token.getIdentifier(), is(1));
+  }
+
+  @Test
+  public void test() {
+    // To be sure that token index will be relative to file - run twice:
+    bridge.chunk(null);
+    List<TokensLine> lines = bridge.chunk(null);
+
+    assertThat(lines.size(), is(3));
+
+    TokensLine line = lines.get(0);
+    // 2 tokens on 1 line
+    assertThat(line.getStartUnit(), is(1));
+    assertThat(line.getEndUnit(), is(2));
 
-    TokenizerBridge bridge = new TokenizerBridge(tokenizer, "UTF-8");
-    List<Statement> statements = bridge.tokenize(null);
-    bridge.clearCache();
+    assertThat(line.getStartLine(), is(1));
+    assertThat(line.getEndLine(), is(1));
+    assertThat(line.getHashCode(), is("t1t2".hashCode()));
 
-    assertThat(statements.size(), is(3));
+    line = lines.get(1);
+    // 1 token on 2 line
+    assertThat(line.getStartUnit(), is(3));
+    assertThat(line.getEndUnit(), is(3));
 
-    Statement statement = statements.get(0);
-    assertThat(statement.getStartLine(), is(1));
-    assertThat(statement.getEndLine(), is(1));
-    assertThat(statement.getValue(), is("t1t2"));
+    assertThat(line.getStartLine(), is(2));
+    assertThat(line.getEndLine(), is(2));
+    assertThat(line.getHashCode(), is("t3".hashCode()));
 
-    statement = statements.get(1);
-    assertThat(statement.getStartLine(), is(2));
-    assertThat(statement.getEndLine(), is(2));
-    assertThat(statement.getValue(), is("t3"));
+    line = lines.get(2);
+    // 3 tokens on 4 line
+    assertThat(line.getStartUnit(), is(4));
+    assertThat(line.getEndUnit(), is(6));
 
-    statement = statements.get(2);
-    assertThat(statement.getStartLine(), is(4));
-    assertThat(statement.getEndLine(), is(4));
-    assertThat(statement.getValue(), is("t1t3"));
+    assertThat(line.getStartLine(), is(4));
+    assertThat(line.getEndLine(), is(4));
+    assertThat(line.getHashCode(), is("t1t3t3".hashCode()));
   }
 
 }
author	Evgeny Mandrikov <mandrikov@gmail.com>	2012-01-25 19:07:33 +0400
committer	Evgeny Mandrikov <mandrikov@gmail.com>	2012-01-26 13:58:50 +0400
commit	90a6c00024ca6fc4476c6ecf7ab30e44e397240f (patch)
tree	ab3476c8b19b4d0bdce94c7119a230c56ef9cebf /sonar-duplications
parent	f185818faa11af412a75e33bf844bd3f21929e03 (diff)
download	sonarqube-90a6c00024ca6fc4476c6ecf7ab30e44e397240f.tar.gz sonarqube-90a6c00024ca6fc4476c6ecf7ab30e44e397240f.zip