Browse Source

SONAR-6182 Log a warning when an unmappable character is detected in a file

tags/5.1-RC1
Julien HENRY 9 years ago
parent
commit
e14a07c4fa

+ 15
- 24
sonar-batch/src/main/java/org/sonar/batch/index/SourcePersister.java View File

@@ -19,6 +19,7 @@
*/
package org.sonar.batch.index;

import org.apache.commons.codec.binary.Hex;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.ibatis.session.ResultContext;
import org.apache.ibatis.session.ResultHandler;
@@ -26,6 +27,8 @@ import org.sonar.api.batch.fs.InputFile;
import org.sonar.api.batch.fs.internal.DefaultInputFile;
import org.sonar.api.utils.System2;
import org.sonar.batch.ProjectTree;
import org.sonar.batch.scan.filesystem.FileMetadata;
import org.sonar.batch.scan.filesystem.FileMetadata.LineHashConsumer;
import org.sonar.batch.scan.filesystem.InputFileMetadata;
import org.sonar.batch.scan.filesystem.InputPathCache;
import org.sonar.core.persistence.DbSession;
@@ -34,10 +37,9 @@ import org.sonar.core.source.db.FileSourceDto;
import org.sonar.core.source.db.FileSourceMapper;

import javax.annotation.CheckForNull;
import javax.annotation.Nullable;

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.Files;
import java.util.HashMap;
import java.util.Map;

@@ -98,7 +100,7 @@ public class SourcePersister implements ScanPersister {
.setBinaryData(data)
.setDataHash(dataHash)
.setSrcHash(metadata.hash())
.setLineHashes(lineHashesAsMd5Hex(inputFile, metadata))
.setLineHashes(lineHashesAsMd5Hex(inputFile))
.setCreatedAt(system2.now())
.setUpdatedAt(system2.now());
mapper.insert(dto);
@@ -110,7 +112,7 @@ public class SourcePersister implements ScanPersister {
.setBinaryData(data)
.setDataHash(dataHash)
.setSrcHash(metadata.hash())
.setLineHashes(lineHashesAsMd5Hex(inputFile, metadata))
.setLineHashes(lineHashesAsMd5Hex(inputFile))
.setUpdatedAt(system2.now());
mapper.update(previousDto);
session.commit();
@@ -119,34 +121,23 @@ public class SourcePersister implements ScanPersister {
}

@CheckForNull
private String lineHashesAsMd5Hex(DefaultInputFile f, InputFileMetadata metadata) {
private String lineHashesAsMd5Hex(DefaultInputFile f) {
if (f.lines() == 0) {
return null;
}
// A md5 string is 32 char long + '\n' = 33
StringBuilder result = new StringBuilder(f.lines() * (32 + 1));
final StringBuilder result = new StringBuilder(f.lines() * (32 + 1));

try {
BufferedReader reader = Files.newBufferedReader(f.path(), f.charset());
StringBuilder sb = new StringBuilder();
for (int i = 0; i < f.lines(); i++) {
String lineStr = reader.readLine();
lineStr = lineStr == null ? "" : lineStr;
for (int j = 0; j < lineStr.length(); j++) {
char c = lineStr.charAt(j);
if (!Character.isWhitespace(c)) {
sb.append(c);
}
}
if (i > 0) {
FileMetadata.computeLineHashesForIssueTracking(f, new LineHashConsumer() {

@Override
public void consume(int lineIdx, @Nullable byte[] hash) {
if (lineIdx > 0) {
result.append("\n");
}
result.append(sb.length() > 0 ? DigestUtils.md5Hex(sb.toString()) : "");
sb.setLength(0);
result.append(hash != null ? Hex.encodeHexString(hash) : "");
}
} catch (Exception e) {
throw new IllegalStateException("Unable to compute line hashes of file " + f, e);
}
});

return result.toString();
}

+ 11
- 25
sonar-batch/src/main/java/org/sonar/batch/issue/tracking/FileHashes.java View File

@@ -19,17 +19,16 @@
*/
package org.sonar.batch.issue.tracking;

import com.google.common.base.Charsets;
import com.google.common.collect.LinkedHashMultimap;
import com.google.common.collect.Multimap;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang.ObjectUtils;
import org.sonar.api.batch.fs.internal.DefaultInputFile;
import org.sonar.batch.scan.filesystem.FileMetadata;
import org.sonar.batch.scan.filesystem.FileMetadata.LineHashConsumer;

import javax.annotation.Nullable;

import java.io.BufferedReader;
import java.nio.file.Files;
import java.security.MessageDigest;
import java.util.Collection;

/**
@@ -56,27 +55,14 @@ public final class FileHashes {
}

public static FileHashes create(DefaultInputFile f) {
byte[][] hashes = new byte[f.lines()][];
try {
BufferedReader reader = Files.newBufferedReader(f.path(), f.charset());
MessageDigest lineMd5Digest = DigestUtils.getMd5Digest();
StringBuilder sb = new StringBuilder();
for (int i = 0; i < f.lines(); i++) {
String lineStr = reader.readLine();
if (lineStr != null) {
for (int j = 0; j < lineStr.length(); j++) {
char c = lineStr.charAt(j);
if (!Character.isWhitespace(c)) {
sb.append(c);
}
}
}
hashes[i] = sb.length() > 0 ? lineMd5Digest.digest(sb.toString().getBytes(Charsets.UTF_8)) : null;
sb.setLength(0);
final byte[][] hashes = new byte[f.lines()][];
FileMetadata.computeLineHashesForIssueTracking(f, new LineHashConsumer() {

@Override
public void consume(int lineIdx, @Nullable byte[] hash) {
hashes[lineIdx - 1] = hash;
}
} catch (Exception e) {
throw new IllegalStateException("Unable to compute line hashes of file " + f, e);
}
});

int size = hashes.length;
Multimap<String, Integer> linesByHash = LinkedHashMultimap.create();

+ 80
- 16
sonar-batch/src/main/java/org/sonar/batch/scan/filesystem/FileMetadata.java View File

@@ -25,10 +25,15 @@ import org.apache.commons.codec.binary.Hex;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.ByteOrderMark;
import org.apache.commons.io.input.BOMInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.sonar.api.BatchComponent;
import org.sonar.api.CoreProperties;
import org.sonar.api.batch.AnalysisMode;
import org.sonar.api.batch.fs.internal.DefaultInputFile;

import javax.annotation.CheckForNull;
import javax.annotation.Nullable;

import java.io.BufferedReader;
import java.io.File;
@@ -47,6 +52,8 @@ import java.util.List;
*/
public class FileMetadata implements BatchComponent {

private static final Logger LOG = LoggerFactory.getLogger(FileMetadata.class);

private static final char LINE_FEED = '\n';
private static final char CARRIAGE_RETURN = '\r';
private final AnalysisMode analysisMode;
@@ -55,7 +62,7 @@ public class FileMetadata implements BatchComponent {
this.analysisMode = analysisMode;
}

private abstract class CharHandler {
private static abstract class CharHandler {

void handleAll(char c) {
}
@@ -70,15 +77,28 @@ public class FileMetadata implements BatchComponent {
}
}

private class LineCounter extends CharHandler {
private static class LineCounter extends CharHandler {
private boolean empty = true;
private int lines = 1;
private int nonBlankLines = 0;
private boolean blankLine = true;
boolean alreadyLoggedInvalidCharacter = false;
private final File file;
private final Charset encoding;

LineCounter(File file, Charset encoding) {
this.file = file;
this.encoding = encoding;
}

@Override
void handleAll(char c) {
this.empty = false;
if (!alreadyLoggedInvalidCharacter && c == '\ufffd') {
LOG.warn("Invalid character encountered in file " + file + " at line " + lines
+ " for encoding " + encoding + ". Please fix file content or configure the encoding to be used using property '" + CoreProperties.ENCODING_PROPERTY + "'.");
alreadyLoggedInvalidCharacter = true;
}
}

@Override
@@ -117,10 +137,9 @@ public class FileMetadata implements BatchComponent {
}
}

private class FileHashComputer extends CharHandler {
private static class FileHashComputer extends CharHandler {
private MessageDigest globalMd5Digest = DigestUtils.getMd5Digest();

StringBuffer sb = new StringBuffer();
private StringBuffer sb = new StringBuffer();

@Override
void handleIgnoreEoL(char c) {
@@ -147,7 +166,38 @@ public class FileMetadata implements BatchComponent {
}
}

private class LineOffsetCounter extends CharHandler {
private static class LineHashComputer extends CharHandler {
private final MessageDigest lineMd5Digest = DigestUtils.getMd5Digest();
private final StringBuffer sb = new StringBuffer();
private final LineHashConsumer consumer;
private int line = 1;

public LineHashComputer(LineHashConsumer consumer) {
this.consumer = consumer;
}

@Override
void handleIgnoreEoL(char c) {
if (!Character.isWhitespace(c)) {
sb.append(c);
}
}

@Override
void newLine() {
consumer.consume(line, sb.length() > 0 ? lineMd5Digest.digest(sb.toString().getBytes(Charsets.UTF_8)) : null);
sb.setLength(0);
line++;
}

@Override
void eof() {
consumer.consume(line, sb.length() > 0 ? lineMd5Digest.digest(sb.toString().getBytes(Charsets.UTF_8)) : null);
}

}

private static class LineOffsetCounter extends CharHandler {
private int currentOriginalOffset = 0;
private List<Integer> originalLineOffsets = new ArrayList<Integer>();

@@ -176,17 +226,21 @@ public class FileMetadata implements BatchComponent {
* Maximum performance is needed.
*/
Metadata read(File file, Charset encoding) {
char c = (char) 0;
LineCounter lineCounter = new LineCounter();
LineCounter lineCounter = new LineCounter(file, encoding);
FileHashComputer fileHashComputer = new FileHashComputer();
LineOffsetCounter lineOffsetCounter = new LineOffsetCounter();
CharHandler[] handlers;
if (analysisMode.isPreview()) {
// No need to compute line offsets in preview mode since there is no syntax highlighting
handlers = new CharHandler[] {lineCounter, fileHashComputer};
if (!analysisMode.isPreview()) {
scanFile(file, encoding, lineCounter, fileHashComputer, lineOffsetCounter);
} else {
handlers = new CharHandler[] {lineCounter, fileHashComputer, lineOffsetCounter};
// No need to compute line offsets in preview mode since there is no syntax highlighting
scanFile(file, encoding, lineCounter, fileHashComputer);
}
return new Metadata(lineCounter.lines(), lineCounter.nonBlankLines(), fileHashComputer.getHash(), lineOffsetCounter.getOriginalLineOffsets(),
lineCounter.isEmpty());
}

private static void scanFile(File file, Charset encoding, CharHandler... handlers) {
char c = (char) 0;
try (BOMInputStream bomIn = new BOMInputStream(new FileInputStream(file),
ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE);
Reader reader = new BufferedReader(new InputStreamReader(bomIn, encoding))) {
@@ -224,9 +278,6 @@ public class FileMetadata implements BatchComponent {
for (CharHandler handler : handlers) {
handler.eof();
}
return new Metadata(lineCounter.lines(), lineCounter.nonBlankLines(), fileHashComputer.getHash(), lineOffsetCounter.getOriginalLineOffsets(),
lineCounter.isEmpty());

} catch (IOException e) {
throw new IllegalStateException(String.format("Fail to read file '%s' with encoding '%s'", file.getAbsolutePath(), encoding), e);
}
@@ -247,4 +298,17 @@ public class FileMetadata implements BatchComponent {
this.originalLineOffsets = Ints.toArray(originalLineOffsets);
}
}

public static interface LineHashConsumer {

void consume(int lineIdx, @Nullable byte[] hash);

}

/**
* Compute a MD5 hash of each line of the file after removing of all blank chars
*/
public static void computeLineHashesForIssueTracking(DefaultInputFile f, LineHashConsumer consumer) {
scanFile(f.file(), f.charset(), new LineHashComputer(consumer));
}
}

+ 3
- 1
sonar-batch/src/test/java/org/sonar/batch/issue/tracking/IssueTrackingDecoratorTest.java View File

@@ -22,6 +22,7 @@ package org.sonar.batch.issue.tracking;
import com.google.common.base.Charsets;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.junit.Before;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
@@ -224,8 +225,9 @@ public class IssueTrackingDecoratorTest {
DefaultInputFile inputFile = mock(DefaultInputFile.class);
java.io.File f = temp.newFile();
when(inputFile.path()).thenReturn(f.toPath());
when(inputFile.file()).thenReturn(f);
when(inputFile.charset()).thenReturn(Charsets.UTF_8);
when(inputFile.lines()).thenReturn(newSource.split("\n").length);
when(inputFile.lines()).thenReturn(StringUtils.countMatches(newSource, "\n") + 1);
FileUtils.write(f, newSource, Charsets.UTF_8);
when(inputFile.key()).thenReturn("foo:Action.java");
when(inputPathCache.getFile("foo", "Action.java")).thenReturn(inputFile);

+ 3
- 1
sonar-batch/src/test/java/org/sonar/batch/issue/tracking/IssueTrackingTest.java View File

@@ -25,6 +25,7 @@ import com.google.common.collect.Lists;
import com.google.common.io.Resources;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
@@ -354,9 +355,10 @@ public class IssueTrackingTest {
DefaultInputFile inputFile = mock(DefaultInputFile.class);
File f = temp.newFile();
when(inputFile.path()).thenReturn(f.toPath());
when(inputFile.file()).thenReturn(f);
when(inputFile.charset()).thenReturn(Charsets.UTF_8);
String data = load(newSource);
when(inputFile.lines()).thenReturn(data.split("\n").length);
when(inputFile.lines()).thenReturn(StringUtils.countMatches(data, "\n") + 1);
FileUtils.write(f, data, Charsets.UTF_8);
when(inputFile.key()).thenReturn("foo:Action.java");
when(lastSnapshots.getLineHashes("foo:Action.java")).thenReturn(computeHexHashes(load(reference)));

+ 12
- 0
sonar-batch/src/test/java/org/sonar/batch/scan/filesystem/FileMetadataTest.java View File

@@ -28,6 +28,7 @@ import org.junit.rules.TemporaryFolder;
import org.sonar.api.batch.AnalysisMode;

import java.io.File;
import java.nio.charset.Charset;

import static org.apache.commons.codec.digest.DigestUtils.md5Hex;
import static org.assertj.core.api.Assertions.assertThat;
@@ -69,6 +70,17 @@ public class FileMetadataTest {
assertThat(metadata.empty).isFalse();
}

@Test
public void read_with_wrong_encoding() throws Exception {
File tempFile = temp.newFile();
FileUtils.write(tempFile, "marker´s\n", Charset.forName("cp1252"));

FileMetadata.Metadata metadata = new FileMetadata(mode).read(tempFile, Charsets.UTF_8);
assertThat(metadata.lines).isEqualTo(2);
assertThat(metadata.hash).isEqualTo(md5Hex("marker\ufffds\n"));
assertThat(metadata.originalLineOffsets).containsOnly(0, 9);
}

@Test
public void non_ascii_utf_8() throws Exception {
File tempFile = temp.newFile();

Loading…
Cancel
Save