From e14a07c4fa5ae1c4a1e407f2a5407a8eb0b3428f Mon Sep 17 00:00:00 2001 From: Julien HENRY Date: Thu, 12 Feb 2015 14:31:52 +0100 Subject: [PATCH] SONAR-6182 Log a warning when unmappable character is detected in a file --- .../sonar/batch/index/SourcePersister.java | 39 +++----- .../batch/issue/tracking/FileHashes.java | 36 +++---- .../batch/scan/filesystem/FileMetadata.java | 96 +++++++++++++++---- .../tracking/IssueTrackingDecoratorTest.java | 4 +- .../issue/tracking/IssueTrackingTest.java | 4 +- .../scan/filesystem/FileMetadataTest.java | 12 +++ 6 files changed, 124 insertions(+), 67 deletions(-) diff --git a/sonar-batch/src/main/java/org/sonar/batch/index/SourcePersister.java b/sonar-batch/src/main/java/org/sonar/batch/index/SourcePersister.java index 14cd85e371a..cf1fae2da06 100644 --- a/sonar-batch/src/main/java/org/sonar/batch/index/SourcePersister.java +++ b/sonar-batch/src/main/java/org/sonar/batch/index/SourcePersister.java @@ -19,6 +19,7 @@ */ package org.sonar.batch.index; +import org.apache.commons.codec.binary.Hex; import org.apache.commons.codec.digest.DigestUtils; import org.apache.ibatis.session.ResultContext; import org.apache.ibatis.session.ResultHandler; @@ -26,6 +27,8 @@ import org.sonar.api.batch.fs.InputFile; import org.sonar.api.batch.fs.internal.DefaultInputFile; import org.sonar.api.utils.System2; import org.sonar.batch.ProjectTree; +import org.sonar.batch.scan.filesystem.FileMetadata; +import org.sonar.batch.scan.filesystem.FileMetadata.LineHashConsumer; import org.sonar.batch.scan.filesystem.InputFileMetadata; import org.sonar.batch.scan.filesystem.InputPathCache; import org.sonar.core.persistence.DbSession; @@ -34,10 +37,9 @@ import org.sonar.core.source.db.FileSourceDto; import org.sonar.core.source.db.FileSourceMapper; import javax.annotation.CheckForNull; +import javax.annotation.Nullable; -import java.io.BufferedReader; import java.io.IOException; -import java.nio.file.Files; import java.util.HashMap; import java.util.Map; @@ -98,7 +100,7 @@ public class SourcePersister implements ScanPersister { .setBinaryData(data) .setDataHash(dataHash) .setSrcHash(metadata.hash()) - .setLineHashes(lineHashesAsMd5Hex(inputFile, metadata)) + .setLineHashes(lineHashesAsMd5Hex(inputFile)) .setCreatedAt(system2.now()) .setUpdatedAt(system2.now()); mapper.insert(dto); @@ -110,7 +112,7 @@ public class SourcePersister implements ScanPersister { .setBinaryData(data) .setDataHash(dataHash) .setSrcHash(metadata.hash()) - .setLineHashes(lineHashesAsMd5Hex(inputFile, metadata)) + .setLineHashes(lineHashesAsMd5Hex(inputFile)) .setUpdatedAt(system2.now()); mapper.update(previousDto); session.commit(); @@ -119,34 +121,23 @@ public class SourcePersister implements ScanPersister { } @CheckForNull - private String lineHashesAsMd5Hex(DefaultInputFile f, InputFileMetadata metadata) { + private String lineHashesAsMd5Hex(DefaultInputFile f) { if (f.lines() == 0) { return null; } // A md5 string is 32 char long + '\n' = 33 - StringBuilder result = new StringBuilder(f.lines() * (32 + 1)); + final StringBuilder result = new StringBuilder(f.lines() * (32 + 1)); - try { - BufferedReader reader = Files.newBufferedReader(f.path(), f.charset()); - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < f.lines(); i++) { - String lineStr = reader.readLine(); - lineStr = lineStr == null ? "" : lineStr; - for (int j = 0; j < lineStr.length(); j++) { - char c = lineStr.charAt(j); - if (!Character.isWhitespace(c)) { - sb.append(c); - } - } - if (i > 0) { + FileMetadata.computeLineHashesForIssueTracking(f, new LineHashConsumer() { + + @Override + public void consume(int lineIdx, @Nullable byte[] hash) { + if (lineIdx > 0) { result.append("\n"); } - result.append(sb.length() > 0 ? DigestUtils.md5Hex(sb.toString()) : ""); - sb.setLength(0); + result.append(hash != null ? Hex.encodeHexString(hash) : ""); } - } catch (Exception e) { - throw new IllegalStateException("Unable to compute line hashes of file " + f, e); - } + }); return result.toString(); } diff --git a/sonar-batch/src/main/java/org/sonar/batch/issue/tracking/FileHashes.java b/sonar-batch/src/main/java/org/sonar/batch/issue/tracking/FileHashes.java index fbc293e8004..996351c26f6 100644 --- a/sonar-batch/src/main/java/org/sonar/batch/issue/tracking/FileHashes.java +++ b/sonar-batch/src/main/java/org/sonar/batch/issue/tracking/FileHashes.java @@ -19,17 +19,16 @@ */ package org.sonar.batch.issue.tracking; -import com.google.common.base.Charsets; import com.google.common.collect.LinkedHashMultimap; import com.google.common.collect.Multimap; import org.apache.commons.codec.binary.Hex; -import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang.ObjectUtils; import org.sonar.api.batch.fs.internal.DefaultInputFile; +import org.sonar.batch.scan.filesystem.FileMetadata; +import org.sonar.batch.scan.filesystem.FileMetadata.LineHashConsumer; + +import javax.annotation.Nullable; -import java.io.BufferedReader; -import java.nio.file.Files; -import java.security.MessageDigest; import java.util.Collection; /** @@ -56,27 +55,14 @@ public final class FileHashes { } public static FileHashes create(DefaultInputFile f) { - byte[][] hashes = new byte[f.lines()][]; - try { - BufferedReader reader = Files.newBufferedReader(f.path(), f.charset()); - MessageDigest lineMd5Digest = DigestUtils.getMd5Digest(); - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < f.lines(); i++) { - String lineStr = reader.readLine(); - if (lineStr != null) { - for (int j = 0; j < lineStr.length(); j++) { - char c = lineStr.charAt(j); - if (!Character.isWhitespace(c)) { - sb.append(c); - } - } - } - hashes[i] = sb.length() > 0 ? lineMd5Digest.digest(sb.toString().getBytes(Charsets.UTF_8)) : null; - sb.setLength(0); + final byte[][] hashes = new byte[f.lines()][]; + FileMetadata.computeLineHashesForIssueTracking(f, new LineHashConsumer() { + + @Override + public void consume(int lineIdx, @Nullable byte[] hash) { + hashes[lineIdx - 1] = hash; } - } catch (Exception e) { - throw new IllegalStateException("Unable to compute line hashes of file " + f, e); - } + }); int size = hashes.length; Multimap linesByHash = LinkedHashMultimap.create(); diff --git a/sonar-batch/src/main/java/org/sonar/batch/scan/filesystem/FileMetadata.java b/sonar-batch/src/main/java/org/sonar/batch/scan/filesystem/FileMetadata.java index 0f91e4ae3cc..6e3031a9059 100644 --- a/sonar-batch/src/main/java/org/sonar/batch/scan/filesystem/FileMetadata.java +++ b/sonar-batch/src/main/java/org/sonar/batch/scan/filesystem/FileMetadata.java @@ -25,10 +25,15 @@ import org.apache.commons.codec.binary.Hex; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.io.ByteOrderMark; import org.apache.commons.io.input.BOMInputStream; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.sonar.api.BatchComponent; +import org.sonar.api.CoreProperties; import org.sonar.api.batch.AnalysisMode; +import org.sonar.api.batch.fs.internal.DefaultInputFile; import javax.annotation.CheckForNull; +import javax.annotation.Nullable; import java.io.BufferedReader; import java.io.File; @@ -47,6 +52,8 @@ import java.util.List; */ public class FileMetadata implements BatchComponent { + private static final Logger LOG = LoggerFactory.getLogger(FileMetadata.class); + private static final char LINE_FEED = '\n'; private static final char CARRIAGE_RETURN = '\r'; private final AnalysisMode analysisMode; @@ -55,7 +62,7 @@ public class FileMetadata implements BatchComponent { this.analysisMode = analysisMode; } - private abstract class CharHandler { + private static abstract class CharHandler { void handleAll(char c) { } @@ -70,15 +77,28 @@ public class FileMetadata implements BatchComponent { } } - private class LineCounter extends CharHandler { + private static class LineCounter extends CharHandler { private boolean empty = true; private int lines = 1; private int nonBlankLines = 0; private boolean blankLine = true; + boolean alreadyLoggedInvalidCharacter = false; + private final File file; + private final Charset encoding; + + LineCounter(File file, Charset encoding) { + this.file = file; + this.encoding = encoding; + } @Override void handleAll(char c) { this.empty = false; + if (!alreadyLoggedInvalidCharacter && c == '\ufffd') { + LOG.warn("Invalid character encountered in file " + file + " at line " + lines + + " for encoding " + encoding + ". Please fix file content or configure the encoding to be used using property '" + CoreProperties.ENCODING_PROPERTY + "'."); + alreadyLoggedInvalidCharacter = true; + } } @Override @@ -117,10 +137,9 @@ public class FileMetadata implements BatchComponent { } } - private class FileHashComputer extends CharHandler { + private static class FileHashComputer extends CharHandler { private MessageDigest globalMd5Digest = DigestUtils.getMd5Digest(); - - StringBuffer sb = new StringBuffer(); + private StringBuffer sb = new StringBuffer(); @Override void handleIgnoreEoL(char c) { @@ -147,7 +166,38 @@ public class FileMetadata implements BatchComponent { } } - private class LineOffsetCounter extends CharHandler { + private static class LineHashComputer extends CharHandler { + private final MessageDigest lineMd5Digest = DigestUtils.getMd5Digest(); + private final StringBuffer sb = new StringBuffer(); + private final LineHashConsumer consumer; + private int line = 1; + + public LineHashComputer(LineHashConsumer consumer) { + this.consumer = consumer; + } + + @Override + void handleIgnoreEoL(char c) { + if (!Character.isWhitespace(c)) { + sb.append(c); + } + } + + @Override + void newLine() { + consumer.consume(line, sb.length() > 0 ? lineMd5Digest.digest(sb.toString().getBytes(Charsets.UTF_8)) : null); + sb.setLength(0); + line++; + } + + @Override + void eof() { + consumer.consume(line, sb.length() > 0 ? lineMd5Digest.digest(sb.toString().getBytes(Charsets.UTF_8)) : null); + } + + } + + private static class LineOffsetCounter extends CharHandler { private int currentOriginalOffset = 0; private List originalLineOffsets = new ArrayList(); @@ -176,17 +226,21 @@ public class FileMetadata implements BatchComponent { * Maximum performance is needed. */ Metadata read(File file, Charset encoding) { - char c = (char) 0; - LineCounter lineCounter = new LineCounter(); + LineCounter lineCounter = new LineCounter(file, encoding); FileHashComputer fileHashComputer = new FileHashComputer(); LineOffsetCounter lineOffsetCounter = new LineOffsetCounter(); - CharHandler[] handlers; - if (analysisMode.isPreview()) { - // No need to compute line offsets in preview mode since there is no syntax highlighting - handlers = new CharHandler[] {lineCounter, fileHashComputer}; + if (!analysisMode.isPreview()) { + scanFile(file, encoding, lineCounter, fileHashComputer, lineOffsetCounter); } else { - handlers = new CharHandler[] {lineCounter, fileHashComputer, lineOffsetCounter}; + // No need to compute line offsets in preview mode since there is no syntax highlighting + scanFile(file, encoding, lineCounter, fileHashComputer); } + return new Metadata(lineCounter.lines(), lineCounter.nonBlankLines(), fileHashComputer.getHash(), lineOffsetCounter.getOriginalLineOffsets(), + lineCounter.isEmpty()); + } + + private static void scanFile(File file, Charset encoding, CharHandler... handlers) { + char c = (char) 0; try (BOMInputStream bomIn = new BOMInputStream(new FileInputStream(file), ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE); Reader reader = new BufferedReader(new InputStreamReader(bomIn, encoding))) { @@ -224,9 +278,6 @@ public class FileMetadata implements BatchComponent { for (CharHandler handler : handlers) { handler.eof(); } - return new Metadata(lineCounter.lines(), lineCounter.nonBlankLines(), fileHashComputer.getHash(), lineOffsetCounter.getOriginalLineOffsets(), - lineCounter.isEmpty()); - } catch (IOException e) { throw new IllegalStateException(String.format("Fail to read file '%s' with encoding '%s'", file.getAbsolutePath(), encoding), e); } @@ -247,4 +298,17 @@ public class FileMetadata implements BatchComponent { this.originalLineOffsets = Ints.toArray(originalLineOffsets); } } + + public static interface LineHashConsumer { + + void consume(int lineIdx, @Nullable byte[] hash); + + } + + /** + * Compute a MD5 hash of each line of the file after removing of all blank chars + */ + public static void computeLineHashesForIssueTracking(DefaultInputFile f, LineHashConsumer consumer) { + scanFile(f.file(), f.charset(), new LineHashComputer(consumer)); + } } diff --git a/sonar-batch/src/test/java/org/sonar/batch/issue/tracking/IssueTrackingDecoratorTest.java b/sonar-batch/src/test/java/org/sonar/batch/issue/tracking/IssueTrackingDecoratorTest.java index 1c5e080cd06..439457caa78 100644 --- a/sonar-batch/src/test/java/org/sonar/batch/issue/tracking/IssueTrackingDecoratorTest.java +++ b/sonar-batch/src/test/java/org/sonar/batch/issue/tracking/IssueTrackingDecoratorTest.java @@ -22,6 +22,7 @@ package org.sonar.batch.issue.tracking; import com.google.common.base.Charsets; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.io.FileUtils; +import org.apache.commons.lang.StringUtils; import org.junit.Before; import org.junit.Test; import org.junit.rules.TemporaryFolder; @@ -224,8 +225,9 @@ public class IssueTrackingDecoratorTest { DefaultInputFile inputFile = mock(DefaultInputFile.class); java.io.File f = temp.newFile(); when(inputFile.path()).thenReturn(f.toPath()); + when(inputFile.file()).thenReturn(f); when(inputFile.charset()).thenReturn(Charsets.UTF_8); - when(inputFile.lines()).thenReturn(newSource.split("\n").length); + when(inputFile.lines()).thenReturn(StringUtils.countMatches(newSource, "\n") + 1); FileUtils.write(f, newSource, Charsets.UTF_8); when(inputFile.key()).thenReturn("foo:Action.java"); when(inputPathCache.getFile("foo", "Action.java")).thenReturn(inputFile); diff --git a/sonar-batch/src/test/java/org/sonar/batch/issue/tracking/IssueTrackingTest.java b/sonar-batch/src/test/java/org/sonar/batch/issue/tracking/IssueTrackingTest.java index bcc25ac598c..095429f3cbf 100644 --- a/sonar-batch/src/test/java/org/sonar/batch/issue/tracking/IssueTrackingTest.java +++ b/sonar-batch/src/test/java/org/sonar/batch/issue/tracking/IssueTrackingTest.java @@ -25,6 +25,7 @@ import com.google.common.collect.Lists; import com.google.common.io.Resources; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.io.FileUtils; +import org.apache.commons.lang.StringUtils; import org.junit.Before; import org.junit.Rule; import org.junit.Test; @@ -354,9 +355,10 @@ public class IssueTrackingTest { DefaultInputFile inputFile = mock(DefaultInputFile.class); File f = temp.newFile(); when(inputFile.path()).thenReturn(f.toPath()); + when(inputFile.file()).thenReturn(f); when(inputFile.charset()).thenReturn(Charsets.UTF_8); String data = load(newSource); - when(inputFile.lines()).thenReturn(data.split("\n").length); + when(inputFile.lines()).thenReturn(StringUtils.countMatches(data, "\n") + 1); FileUtils.write(f, data, Charsets.UTF_8); when(inputFile.key()).thenReturn("foo:Action.java"); when(lastSnapshots.getLineHashes("foo:Action.java")).thenReturn(computeHexHashes(load(reference))); diff --git a/sonar-batch/src/test/java/org/sonar/batch/scan/filesystem/FileMetadataTest.java b/sonar-batch/src/test/java/org/sonar/batch/scan/filesystem/FileMetadataTest.java index b34c5d50271..b05c9cc414c 100644 --- a/sonar-batch/src/test/java/org/sonar/batch/scan/filesystem/FileMetadataTest.java +++ b/sonar-batch/src/test/java/org/sonar/batch/scan/filesystem/FileMetadataTest.java @@ -28,6 +28,7 @@ import org.junit.rules.TemporaryFolder; import org.sonar.api.batch.AnalysisMode; import java.io.File; +import java.nio.charset.Charset; import static org.apache.commons.codec.digest.DigestUtils.md5Hex; import static org.assertj.core.api.Assertions.assertThat; @@ -69,6 +70,17 @@ public class FileMetadataTest { assertThat(metadata.empty).isFalse(); } + @Test + public void read_with_wrong_encoding() throws Exception { + File tempFile = temp.newFile(); + FileUtils.write(tempFile, "marker´s\n", Charset.forName("cp1252")); + + FileMetadata.Metadata metadata = new FileMetadata(mode).read(tempFile, Charsets.UTF_8); + assertThat(metadata.lines).isEqualTo(2); + assertThat(metadata.hash).isEqualTo(md5Hex("marker\ufffds\n")); + assertThat(metadata.originalLineOffsets).containsOnly(0, 9); + } + @Test public void non_ascii_utf_8() throws Exception { File tempFile = temp.newFile(); -- 2.39.5