source.dussan.org Git - sonarqube.git/commitdiff
SONAR-9204 Files with a BOM are not correctly parsed by the CPD tokenizer
authorJulien HENRY <julien.henry@sonarsource.com>
Tue, 9 May 2017 12:17:18 +0000 (14:17 +0200)
committerJulien HENRY <henryju@yahoo.fr>
Tue, 9 May 2017 16:02:07 +0000 (18:02 +0200)
plugins/sonar-xoo-plugin/src/main/java/org/sonar/xoo/lang/XooTokenizer.java
plugins/sonar-xoo-plugin/src/test/java/org/sonar/xoo/lang/XooTokenizerTest.java
sonar-duplications/src/main/java/org/sonar/duplications/cpd/FileCodeLoaderWithoutCache.java
sonar-duplications/src/main/java/org/sonar/duplications/internal/pmd/TokenizerBridge.java
sonar-duplications/src/test/java/org/sonar/duplications/internal/pmd/PmdBridgeTest.java
sonar-duplications/src/test/java/org/sonar/duplications/internal/pmd/TokenizerBridgeTest.java
sonar-scanner-engine/src/main/java/org/sonar/scanner/cpd/deprecated/DefaultCpdBlockIndexer.java
sonar-scanner-engine/src/main/java/org/sonar/scanner/cpd/deprecated/DeprecatedCpdBlockIndexerSensor.java
sonar-scanner-engine/src/main/java/org/sonar/scanner/cpd/deprecated/JavaCpdBlockIndexer.java
sonar-scanner-engine/src/main/java/org/sonar/scanner/report/SourcePublisher.java
sonar-scanner-engine/src/test/java/org/sonar/scanner/cpd/deprecated/JavaCpdBlockIndexerTest.java

index d57467c985945448bf587f888f522fb5b3942dc5..0aa6bffb27c06ea384751913b10b9192455bc22c 100644 (file)
@@ -48,16 +48,12 @@ public class XooTokenizer implements Tokenizer {
     String fileName = source.getFileName();
     LOG.info("Using deprecated tokenizer extension point to tokenize {}", fileName);
     int lineIdx = 1;
-    try {
-      for (String line : FileUtils.readLines(new File(fileName), fs.encoding())) {
-        for (String token : Splitter.on(" ").split(line)) {
-          TokenEntry cpdToken = new TokenEntry(token, fileName, lineIdx);
-          cpdTokens.add(cpdToken);
-        }
-        lineIdx++;
+    for (String line : source.getCode()) {
+      for (String token : Splitter.on(" ").split(line)) {
+        TokenEntry cpdToken = new TokenEntry(token, fileName, lineIdx);
+        cpdTokens.add(cpdToken);
       }
-    } catch (IOException e) {
-      throw new IllegalStateException("Unable to tokenize", e);
+      lineIdx++;
     }
     cpdTokens.add(TokenEntry.getEOF());
   }
index c6a5cbf9859d946fe7332cbd5790b156b0a70bf6..967f87dafdbc52b39ee86b6459ab1c1696db5fbe 100644 (file)
  */
 package org.sonar.xoo.lang;
 
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
 import net.sourceforge.pmd.cpd.SourceCode;
 import net.sourceforge.pmd.cpd.TokenEntry;
 import net.sourceforge.pmd.cpd.Tokens;
@@ -31,11 +35,9 @@ import org.sonar.api.batch.fs.internal.DefaultFileSystem;
 import org.sonar.api.batch.fs.internal.DefaultInputFile;
 import org.sonar.api.batch.fs.internal.TestInputFileBuilder;
 import org.sonar.api.batch.sensor.SensorContext;
-import org.sonar.api.config.Settings;
-
-import java.io.File;
-import java.io.IOException;
 import org.sonar.api.config.MapSettings;
+import org.sonar.api.config.Settings;
+import org.sonar.duplications.cpd.FileCodeLoaderWithoutCache;
 
 import static org.assertj.core.api.Assertions.assertThat;
 import static org.mockito.Mockito.mock;
@@ -63,18 +65,20 @@ public class XooTokenizerTest {
   @Test
   public void testExecution() throws IOException {
     File source = new File(baseDir, "src/foo.xoo");
-    FileUtils.write(source, "token1 token2 token3\ntoken4");
+    FileUtils.write(source, "token1 token2 token3\ntoken4", StandardCharsets.UTF_8);
     DefaultInputFile inputFile = new TestInputFileBuilder("foo", "src/foo.xoo")
       .setLanguage("xoo")
       .setModuleBaseDir(baseDir.toPath())
+      .setCharset(StandardCharsets.UTF_8)
       .build();
     fileSystem.add(inputFile);
 
     XooTokenizer tokenizer = new XooTokenizer(fileSystem);
-    SourceCode sourceCode = mock(SourceCode.class);
-    when(sourceCode.getFileName()).thenReturn(inputFile.absolutePath());
     Tokens cpdTokens = new Tokens();
-    tokenizer.tokenize(sourceCode, cpdTokens);
+    try (InputStreamReader reader = new InputStreamReader(inputFile.inputStream(), inputFile.charset())) {
+      SourceCode sourceCode = new SourceCode(new FileCodeLoaderWithoutCache(inputFile.absolutePath(), reader));
+      tokenizer.tokenize(sourceCode, cpdTokens);
+    }
 
     // 4 tokens + EOF
     assertThat(cpdTokens.getTokens()).hasSize(5);
index b24bafa56a15b15b45fbfc21471cef7418174b33..b3ac2c69cd80219159ab0cd5c340d5c59fee88c2 100644 (file)
  */
 package org.sonar.duplications.cpd;
 
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.InputStreamReader;
 import java.io.Reader;
 
 public class FileCodeLoaderWithoutCache extends CodeLoaderWithoutCache {
 
-  private File file;
-  private String encoding;
+  private final String fileName;
+  private final Reader fileReader;
 
-  public FileCodeLoaderWithoutCache(File file, String encoding) {
-    this.file = file;
-    this.encoding = encoding;
+  public FileCodeLoaderWithoutCache(String fileName, Reader fileReader) {
+    this.fileName = fileName;
+    this.fileReader = fileReader;
   }
 
   @Override
   public Reader getReader() throws Exception {
-    return new InputStreamReader(new FileInputStream(file), encoding);
+    return fileReader;
   }
 
   @Override
   public String getFileName() {
-    return this.file.getAbsolutePath();
+    return fileName;
   }
 }
index a32fe6054e1c882abccc1cc28be2727564f4ee14..0fc08827663e1a813516528c69758b1be1c11a84 100644 (file)
@@ -21,6 +21,9 @@ package org.sonar.duplications.internal.pmd;
 
 import com.google.common.base.Throwables;
 import com.google.common.collect.ImmutableList;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.List;
 import net.sourceforge.pmd.cpd.SourceCode;
 import net.sourceforge.pmd.cpd.TokenEntry;
 import net.sourceforge.pmd.cpd.Tokenizer;
@@ -28,32 +31,25 @@ import net.sourceforge.pmd.cpd.Tokens;
 import org.sonar.duplications.block.Block;
 import org.sonar.duplications.cpd.FileCodeLoaderWithoutCache;
 
-import java.io.File;
-import java.io.IOException;
-import java.util.List;
-
 /**
  * Bridge, which allows to convert list of {@link TokenEntry} produced by {@link Tokenizer} into list of {@link TokensLine}s.
  */
 public class TokenizerBridge {
 
   private final Tokenizer tokenizer;
-  private final String encoding;
   private final PmdBlockChunker blockBuilder;
 
-  public TokenizerBridge(Tokenizer tokenizer, String encoding, int blockSize) {
+  public TokenizerBridge(Tokenizer tokenizer, int blockSize) {
     this.tokenizer = tokenizer;
-    this.encoding = encoding;
     this.blockBuilder = new PmdBlockChunker(blockSize);
   }
 
-  // TODO remove from here
-  public List<Block> chunk(String resourceId, File file) {
-    return blockBuilder.chunk(resourceId, chunk(file));
+  public List<Block> chunk(String resourceId, String fileName, Reader fileReader) {
+    return blockBuilder.chunk(resourceId, chunk(fileName, fileReader));
   }
 
-  public List<TokensLine> chunk(File file) {
-    SourceCode sourceCode = new SourceCode(new FileCodeLoaderWithoutCache(file, encoding));
+  public List<TokensLine> chunk(String fileName, Reader fileReader) {
+    SourceCode sourceCode = new SourceCode(new FileCodeLoaderWithoutCache(fileName, fileReader));
     Tokens tokens = new Tokens();
     TokenEntry.clearImages();
     try {
index d3bac18a7f4859f92e547b292dce8abf0840164c..0d142829d2399181ef7b98c78cd0dbbdbced0c31 100644 (file)
  */
 package org.sonar.duplications.internal.pmd;
 
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.util.Collection;
+import java.util.List;
 import net.sourceforge.pmd.cpd.JavaTokenizer;
 import org.junit.Before;
 import org.junit.Test;
@@ -29,10 +35,6 @@ import org.sonar.duplications.index.CloneIndex;
 import org.sonar.duplications.index.ClonePart;
 import org.sonar.duplications.index.PackedMemoryCloneIndex;
 
-import java.io.File;
-import java.util.Collection;
-import java.util.List;
-
 import static org.assertj.core.api.Assertions.assertThat;
 
 public class PmdBridgeTest {
@@ -43,11 +45,11 @@ public class PmdBridgeTest {
   @Before
   public void setUp() {
     index = new PackedMemoryCloneIndex();
-    bridge = new TokenizerBridge(new JavaTokenizer(), "UTF-8", 10);
+    bridge = new TokenizerBridge(new JavaTokenizer(), 10);
   }
 
   @Test
-  public void testDuplicationInSingleFile() {
+  public void testDuplicationInSingleFile() throws IOException {
     File file = new File("test-resources/org/sonar/duplications/cpd/CPDTest/CPDFile3.java");
     addToIndex(file);
 
@@ -66,7 +68,7 @@ public class PmdBridgeTest {
   }
 
   @Test
-  public void testDuplicationBetweenTwoFiles() {
+  public void testDuplicationBetweenTwoFiles() throws IOException {
     File file1 = new File("test-resources/org/sonar/duplications/cpd/CPDTest/CPDFile1.java");
     File file2 = new File("test-resources/org/sonar/duplications/cpd/CPDTest/CPDFile2.java");
     addToIndex(file1);
@@ -88,8 +90,8 @@ public class PmdBridgeTest {
     return SuffixTreeCloneDetectionAlgorithm.detect(index, fileBlocks);
   }
 
-  private void addToIndex(File file) {
-    List<Block> blocks = bridge.chunk(file.getAbsolutePath(), file);
+  private void addToIndex(File file) throws IOException {
+    List<Block> blocks = bridge.chunk(file.getAbsolutePath(), file.getAbsolutePath(), Files.newBufferedReader(file.toPath(), StandardCharsets.UTF_8));
     for (Block block : blocks) {
       index.insert(block);
     }
index 8e3d0c9c630cf6bbf8e1ea4ecd3f7d1331422f54..f21a8195a837df18d1f8ecb08f1996cb27a902a6 100644 (file)
  */
 package org.sonar.duplications.internal.pmd;
 
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
 import net.sourceforge.pmd.cpd.SourceCode;
 import net.sourceforge.pmd.cpd.TokenEntry;
 import net.sourceforge.pmd.cpd.Tokenizer;
@@ -26,9 +31,6 @@ import net.sourceforge.pmd.cpd.Tokens;
 import org.junit.Before;
 import org.junit.Test;
 
-import java.io.IOException;
-import java.util.List;
-
 import static org.hamcrest.Matchers.is;
 import static org.junit.Assert.assertThat;
 
@@ -49,12 +51,12 @@ public class TokenizerBridgeTest {
         tokenEntries.add(TokenEntry.getEOF());
       }
     };
-    bridge = new TokenizerBridge(tokenizer, "UTF-8", 10);
+    bridge = new TokenizerBridge(tokenizer, 10);
   }
 
   @Test
   public void shouldClearCacheInTokenEntry() {
-    bridge.chunk(null);
+    bridge.chunk("file.txt", new InputStreamReader(new ByteArrayInputStream(new byte[0]), StandardCharsets.UTF_8));
     TokenEntry token = new TokenEntry("image", "srcId", 0);
     assertThat(token.getIndex(), is(0));
     assertThat(token.getIdentifier(), is(1));
@@ -63,8 +65,8 @@ public class TokenizerBridgeTest {
   @Test
   public void test() {
     // To be sure that token index will be relative to file - run twice:
-    bridge.chunk(null);
-    List<TokensLine> lines = bridge.chunk(null);
+    bridge.chunk("file.txt", new InputStreamReader(new ByteArrayInputStream(new byte[0]), StandardCharsets.UTF_8));
+    List<TokensLine> lines = bridge.chunk("file.txt", new InputStreamReader(new ByteArrayInputStream(new byte[0]), StandardCharsets.UTF_8));
 
     assertThat(lines.size(), is(3));
 
index adeaf2c20d14d8b8fd3d28a9b0288e77e1271532..587a88a7f16dee534e79dd096b6391850270bc33 100644 (file)
@@ -21,6 +21,8 @@ package org.sonar.scanner.cpd.deprecated;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.Lists;
+import java.io.IOException;
+import java.io.InputStreamReader;
 import java.util.List;
 import org.sonar.api.CoreProperties;
 import org.sonar.api.batch.CpdMapping;
@@ -80,12 +82,17 @@ public class DefaultCpdBlockIndexer extends CpdBlockIndexer {
   }
 
   private void populateIndex(String languageKey, List<InputFile> sourceFiles, CpdMapping mapping) {
-    TokenizerBridge bridge = new TokenizerBridge(mapping.getTokenizer(), fs.encoding().name(), getBlockSize(languageKey));
+    TokenizerBridge bridge = new TokenizerBridge(mapping.getTokenizer(), getBlockSize(languageKey));
     for (InputFile inputFile : sourceFiles) {
       if (!index.isIndexed(inputFile)) {
         LOG.debug("Populating index from {}", inputFile.absolutePath());
         String resourceEffectiveKey = ((DefaultInputFile) inputFile).key();
-        List<Block> blocks = bridge.chunk(resourceEffectiveKey, inputFile.file());
+        List<Block> blocks;
+        try (InputStreamReader isr = new InputStreamReader(inputFile.inputStream(), inputFile.charset())) {
+          blocks = bridge.chunk(resourceEffectiveKey, inputFile.absolutePath(), isr);
+        } catch (IOException e) {
+          throw new IllegalStateException("Unable to read content of file " + inputFile.absolutePath(), e);
+        }
         index.insert(inputFile, blocks);
       }
     }
index b8721d3616744353998ee5ef364365a3dc1a4963..862ee124d9e678c64bd57b1a201565121ad2600f 100644 (file)
@@ -72,7 +72,7 @@ public class DeprecatedCpdBlockIndexerSensor implements Sensor {
         LOG.debug("Detection of duplicated code is not supported for {}", language);
         continue;
       }
-      LOG.info("{} is used for {}", blockIndexer, language);
+      LOG.debug("{} is used for {}", blockIndexer.getClass().getName(), language);
       blockIndexer.index(language);
     }
   }
index dc1deca096df6069e0946eefa1bb7bc9ef48fb3c..fc0e105d2c1218cfe616fa70e57786541a92827d 100644 (file)
@@ -20,7 +20,6 @@
 package org.sonar.scanner.cpd.deprecated;
 
 import com.google.common.collect.Lists;
-import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
@@ -91,8 +90,8 @@ public class JavaCpdBlockIndexer extends CpdBlockIndexer {
 
       List<Statement> statements;
 
-      try (InputStream is = new FileInputStream(inputFile.file());
-        Reader reader = new InputStreamReader(is, fs.encoding())) {
+      try (InputStream is = inputFile.inputStream();
+        Reader reader = new InputStreamReader(is, inputFile.charset())) {
         statements = statementChunker.chunk(tokenChunker.chunk(reader));
       } catch (FileNotFoundException e) {
         throw new IllegalStateException("Cannot find file " + inputFile.file(), e);
@@ -100,7 +99,12 @@ public class JavaCpdBlockIndexer extends CpdBlockIndexer {
         throw new IllegalStateException("Exception handling file: " + inputFile.file(), e);
       }
 
-      List<Block> blocks = blockChunker.chunk(resourceEffectiveKey, statements);
+      List<Block> blocks;
+      try {
+        blocks = blockChunker.chunk(resourceEffectiveKey, statements);
+      } catch (Exception e) {
+        throw new IllegalStateException("Cannot process file " + inputFile.file(), e);
+      }
       index.insert(inputFile, blocks);
     }
   }
index a161fa33835c47f8d1c0945b1f6b0aabd0f50d61..131a89c4caa749b0dac9d7c816454bcaa5c3c604 100644 (file)
  */
 package org.sonar.scanner.report;
 
-import org.apache.commons.io.ByteOrderMark;
-import org.apache.commons.io.IOUtils;
-import org.apache.commons.io.input.BOMInputStream;
-import org.sonar.api.batch.fs.internal.DefaultInputFile;
-import org.sonar.scanner.protocol.output.ScannerReportWriter;
-import org.sonar.scanner.scan.filesystem.InputComponentStore;
-
 import java.io.BufferedReader;
 import java.io.File;
-import java.io.FileInputStream;
 import java.io.FileOutputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.nio.charset.StandardCharsets;
+import org.apache.commons.io.IOUtils;
+import org.sonar.api.batch.fs.internal.DefaultInputFile;
+import org.sonar.scanner.protocol.output.ScannerReportWriter;
+import org.sonar.scanner.scan.filesystem.InputComponentStore;
 
 public class SourcePublisher implements ReportPublisherStep {
 
@@ -48,9 +45,8 @@ public class SourcePublisher implements ReportPublisherStep {
       File iofile = writer.getSourceFile(inputFile.batchId());
 
       try (FileOutputStream output = new FileOutputStream(iofile);
-        BOMInputStream bomIn = new BOMInputStream(new FileInputStream(inputFile.file()),
-          ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE);
-        BufferedReader reader = new BufferedReader(new InputStreamReader(bomIn, inputFile.charset()))) {
+        InputStream in = inputFile.inputStream();
+        BufferedReader reader = new BufferedReader(new InputStreamReader(in, inputFile.charset()))) {
         writeSource(reader, output, inputFile.lines());
       } catch (IOException e) {
         throw new IllegalStateException("Unable to store file source in the report", e);
index 29c37b61869711ea1b3743ef49a597085f894010..325bb08789b1811c13f94274155710622b8e2de9 100644 (file)
@@ -21,6 +21,7 @@ package org.sonar.scanner.cpd.deprecated;
 
 import java.io.File;
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.util.List;
 import org.apache.commons.io.FileUtils;
 import org.junit.Before;
@@ -36,8 +37,8 @@ import org.sonar.api.batch.fs.FileSystem;
 import org.sonar.api.batch.fs.InputFile;
 import org.sonar.api.batch.fs.internal.DefaultFileSystem;
 import org.sonar.api.batch.fs.internal.TestInputFileBuilder;
-import org.sonar.api.config.Settings;
 import org.sonar.api.config.MapSettings;
+import org.sonar.api.config.Settings;
 import org.sonar.duplications.block.Block;
 import org.sonar.scanner.cpd.index.SonarCpdBlockIndex;
 
@@ -71,6 +72,7 @@ public class JavaCpdBlockIndexerTest {
     DefaultFileSystem fs = new DefaultFileSystem(baseDir);
     file = new TestInputFileBuilder("foo", "src/ManyStatements.java")
       .setModuleBaseDir(baseDir.toPath())
+      .setCharset(StandardCharsets.UTF_8)
       .setLanguage(JAVA).build();
     fs.add(file);
     File ioFile = file.file();