Browse Source

SONAR-6182 Log a warning when unmappable character is detected in a file

tags/5.1-RC1
Julien HENRY 9 years ago
parent
commit
e14a07c4fa

+ 15
- 24
sonar-batch/src/main/java/org/sonar/batch/index/SourcePersister.java View File

*/ */
package org.sonar.batch.index; package org.sonar.batch.index;


import org.apache.commons.codec.binary.Hex;
import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.codec.digest.DigestUtils;
import org.apache.ibatis.session.ResultContext; import org.apache.ibatis.session.ResultContext;
import org.apache.ibatis.session.ResultHandler; import org.apache.ibatis.session.ResultHandler;
import org.sonar.api.batch.fs.internal.DefaultInputFile; import org.sonar.api.batch.fs.internal.DefaultInputFile;
import org.sonar.api.utils.System2; import org.sonar.api.utils.System2;
import org.sonar.batch.ProjectTree; import org.sonar.batch.ProjectTree;
import org.sonar.batch.scan.filesystem.FileMetadata;
import org.sonar.batch.scan.filesystem.FileMetadata.LineHashConsumer;
import org.sonar.batch.scan.filesystem.InputFileMetadata; import org.sonar.batch.scan.filesystem.InputFileMetadata;
import org.sonar.batch.scan.filesystem.InputPathCache; import org.sonar.batch.scan.filesystem.InputPathCache;
import org.sonar.core.persistence.DbSession; import org.sonar.core.persistence.DbSession;
import org.sonar.core.source.db.FileSourceMapper; import org.sonar.core.source.db.FileSourceMapper;


import javax.annotation.CheckForNull; import javax.annotation.CheckForNull;
import javax.annotation.Nullable;


import java.io.BufferedReader;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;


.setBinaryData(data) .setBinaryData(data)
.setDataHash(dataHash) .setDataHash(dataHash)
.setSrcHash(metadata.hash()) .setSrcHash(metadata.hash())
.setLineHashes(lineHashesAsMd5Hex(inputFile, metadata))
.setLineHashes(lineHashesAsMd5Hex(inputFile))
.setCreatedAt(system2.now()) .setCreatedAt(system2.now())
.setUpdatedAt(system2.now()); .setUpdatedAt(system2.now());
mapper.insert(dto); mapper.insert(dto);
.setBinaryData(data) .setBinaryData(data)
.setDataHash(dataHash) .setDataHash(dataHash)
.setSrcHash(metadata.hash()) .setSrcHash(metadata.hash())
.setLineHashes(lineHashesAsMd5Hex(inputFile, metadata))
.setLineHashes(lineHashesAsMd5Hex(inputFile))
.setUpdatedAt(system2.now()); .setUpdatedAt(system2.now());
mapper.update(previousDto); mapper.update(previousDto);
session.commit(); session.commit();
} }


@CheckForNull @CheckForNull
private String lineHashesAsMd5Hex(DefaultInputFile f, InputFileMetadata metadata) {
private String lineHashesAsMd5Hex(DefaultInputFile f) {
if (f.lines() == 0) { if (f.lines() == 0) {
return null; return null;
} }
// A md5 string is 32 char long + '\n' = 33 // A md5 string is 32 char long + '\n' = 33
StringBuilder result = new StringBuilder(f.lines() * (32 + 1));
final StringBuilder result = new StringBuilder(f.lines() * (32 + 1));


try {
BufferedReader reader = Files.newBufferedReader(f.path(), f.charset());
StringBuilder sb = new StringBuilder();
for (int i = 0; i < f.lines(); i++) {
String lineStr = reader.readLine();
lineStr = lineStr == null ? "" : lineStr;
for (int j = 0; j < lineStr.length(); j++) {
char c = lineStr.charAt(j);
if (!Character.isWhitespace(c)) {
sb.append(c);
}
}
if (i > 0) {
FileMetadata.computeLineHashesForIssueTracking(f, new LineHashConsumer() {

@Override
public void consume(int lineIdx, @Nullable byte[] hash) {
if (lineIdx > 0) {
result.append("\n"); result.append("\n");
} }
result.append(sb.length() > 0 ? DigestUtils.md5Hex(sb.toString()) : "");
sb.setLength(0);
result.append(hash != null ? Hex.encodeHexString(hash) : "");
} }
} catch (Exception e) {
throw new IllegalStateException("Unable to compute line hashes of file " + f, e);
}
});


return result.toString(); return result.toString();
} }

+ 11
- 25
sonar-batch/src/main/java/org/sonar/batch/issue/tracking/FileHashes.java View File

*/ */
package org.sonar.batch.issue.tracking; package org.sonar.batch.issue.tracking;


import com.google.common.base.Charsets;
import com.google.common.collect.LinkedHashMultimap; import com.google.common.collect.LinkedHashMultimap;
import com.google.common.collect.Multimap; import com.google.common.collect.Multimap;
import org.apache.commons.codec.binary.Hex; import org.apache.commons.codec.binary.Hex;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang.ObjectUtils; import org.apache.commons.lang.ObjectUtils;
import org.sonar.api.batch.fs.internal.DefaultInputFile; import org.sonar.api.batch.fs.internal.DefaultInputFile;
import org.sonar.batch.scan.filesystem.FileMetadata;
import org.sonar.batch.scan.filesystem.FileMetadata.LineHashConsumer;

import javax.annotation.Nullable;


import java.io.BufferedReader;
import java.nio.file.Files;
import java.security.MessageDigest;
import java.util.Collection; import java.util.Collection;


/** /**
} }


public static FileHashes create(DefaultInputFile f) { public static FileHashes create(DefaultInputFile f) {
byte[][] hashes = new byte[f.lines()][];
try {
BufferedReader reader = Files.newBufferedReader(f.path(), f.charset());
MessageDigest lineMd5Digest = DigestUtils.getMd5Digest();
StringBuilder sb = new StringBuilder();
for (int i = 0; i < f.lines(); i++) {
String lineStr = reader.readLine();
if (lineStr != null) {
for (int j = 0; j < lineStr.length(); j++) {
char c = lineStr.charAt(j);
if (!Character.isWhitespace(c)) {
sb.append(c);
}
}
}
hashes[i] = sb.length() > 0 ? lineMd5Digest.digest(sb.toString().getBytes(Charsets.UTF_8)) : null;
sb.setLength(0);
final byte[][] hashes = new byte[f.lines()][];
FileMetadata.computeLineHashesForIssueTracking(f, new LineHashConsumer() {

@Override
public void consume(int lineIdx, @Nullable byte[] hash) {
hashes[lineIdx - 1] = hash;
} }
} catch (Exception e) {
throw new IllegalStateException("Unable to compute line hashes of file " + f, e);
}
});


int size = hashes.length; int size = hashes.length;
Multimap<String, Integer> linesByHash = LinkedHashMultimap.create(); Multimap<String, Integer> linesByHash = LinkedHashMultimap.create();

+ 80
- 16
sonar-batch/src/main/java/org/sonar/batch/scan/filesystem/FileMetadata.java View File

import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.ByteOrderMark; import org.apache.commons.io.ByteOrderMark;
import org.apache.commons.io.input.BOMInputStream; import org.apache.commons.io.input.BOMInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.sonar.api.BatchComponent; import org.sonar.api.BatchComponent;
import org.sonar.api.CoreProperties;
import org.sonar.api.batch.AnalysisMode; import org.sonar.api.batch.AnalysisMode;
import org.sonar.api.batch.fs.internal.DefaultInputFile;


import javax.annotation.CheckForNull; import javax.annotation.CheckForNull;
import javax.annotation.Nullable;


import java.io.BufferedReader; import java.io.BufferedReader;
import java.io.File; import java.io.File;
*/ */
public class FileMetadata implements BatchComponent { public class FileMetadata implements BatchComponent {


private static final Logger LOG = LoggerFactory.getLogger(FileMetadata.class);

private static final char LINE_FEED = '\n'; private static final char LINE_FEED = '\n';
private static final char CARRIAGE_RETURN = '\r'; private static final char CARRIAGE_RETURN = '\r';
private final AnalysisMode analysisMode; private final AnalysisMode analysisMode;
this.analysisMode = analysisMode; this.analysisMode = analysisMode;
} }


private abstract class CharHandler {
private static abstract class CharHandler {


void handleAll(char c) { void handleAll(char c) {
} }
} }
} }


private class LineCounter extends CharHandler {
private static class LineCounter extends CharHandler {
private boolean empty = true; private boolean empty = true;
private int lines = 1; private int lines = 1;
private int nonBlankLines = 0; private int nonBlankLines = 0;
private boolean blankLine = true; private boolean blankLine = true;
boolean alreadyLoggedInvalidCharacter = false;
private final File file;
private final Charset encoding;

LineCounter(File file, Charset encoding) {
this.file = file;
this.encoding = encoding;
}


@Override @Override
void handleAll(char c) { void handleAll(char c) {
this.empty = false; this.empty = false;
if (!alreadyLoggedInvalidCharacter && c == '\ufffd') {
LOG.warn("Invalid character encountered in file " + file + " at line " + lines
+ " for encoding " + encoding + ". Please fix file content or configure the encoding to be used using property '" + CoreProperties.ENCODING_PROPERTY + "'.");
alreadyLoggedInvalidCharacter = true;
}
} }


@Override @Override
} }
} }


private class FileHashComputer extends CharHandler {
private static class FileHashComputer extends CharHandler {
private MessageDigest globalMd5Digest = DigestUtils.getMd5Digest(); private MessageDigest globalMd5Digest = DigestUtils.getMd5Digest();

StringBuffer sb = new StringBuffer();
private StringBuffer sb = new StringBuffer();


@Override @Override
void handleIgnoreEoL(char c) { void handleIgnoreEoL(char c) {
} }
} }


private class LineOffsetCounter extends CharHandler {
/**
 * Computes, for each line, the MD5 hash of the line content with all whitespace removed,
 * and pushes it to the given {@link LineHashConsumer}. A blank line yields a {@code null} hash.
 * Line indices passed to the consumer are 1-based.
 */
private static class LineHashComputer extends CharHandler {
  private final MessageDigest lineMd5Digest = DigestUtils.getMd5Digest();
  // StringBuilder rather than StringBuffer: this handler is driven by a single-threaded
  // file scan, so StringBuffer's synchronization is pure overhead.
  private final StringBuilder sb = new StringBuilder();
  private final LineHashConsumer consumer;
  private int line = 1;

  public LineHashComputer(LineHashConsumer consumer) {
    this.consumer = consumer;
  }

  @Override
  void handleIgnoreEoL(char c) {
    // Whitespace is excluded from the hash so that formatting-only changes
    // keep the same per-line hash
    if (!Character.isWhitespace(c)) {
      sb.append(c);
    }
  }

  @Override
  void newLine() {
    consumer.consume(line, sb.length() > 0 ? lineMd5Digest.digest(sb.toString().getBytes(Charsets.UTF_8)) : null);
    sb.setLength(0);
    line++;
  }

  @Override
  void eof() {
    // Flush the final line, which is not followed by a line terminator
    consumer.consume(line, sb.length() > 0 ? lineMd5Digest.digest(sb.toString().getBytes(Charsets.UTF_8)) : null);
  }

}

private static class LineOffsetCounter extends CharHandler {
private int currentOriginalOffset = 0; private int currentOriginalOffset = 0;
private List<Integer> originalLineOffsets = new ArrayList<Integer>(); private List<Integer> originalLineOffsets = new ArrayList<Integer>();


* Maximum performance is needed. * Maximum performance is needed.
*/ */
Metadata read(File file, Charset encoding) { Metadata read(File file, Charset encoding) {
char c = (char) 0;
LineCounter lineCounter = new LineCounter();
LineCounter lineCounter = new LineCounter(file, encoding);
FileHashComputer fileHashComputer = new FileHashComputer(); FileHashComputer fileHashComputer = new FileHashComputer();
LineOffsetCounter lineOffsetCounter = new LineOffsetCounter(); LineOffsetCounter lineOffsetCounter = new LineOffsetCounter();
CharHandler[] handlers;
if (analysisMode.isPreview()) {
// No need to compute line offsets in preview mode since there is no syntax highlighting
handlers = new CharHandler[] {lineCounter, fileHashComputer};
if (!analysisMode.isPreview()) {
scanFile(file, encoding, lineCounter, fileHashComputer, lineOffsetCounter);
} else { } else {
handlers = new CharHandler[] {lineCounter, fileHashComputer, lineOffsetCounter};
// No need to compute line offsets in preview mode since there is no syntax highlighting
scanFile(file, encoding, lineCounter, fileHashComputer);
} }
return new Metadata(lineCounter.lines(), lineCounter.nonBlankLines(), fileHashComputer.getHash(), lineOffsetCounter.getOriginalLineOffsets(),
lineCounter.isEmpty());
}

private static void scanFile(File file, Charset encoding, CharHandler... handlers) {
char c = (char) 0;
try (BOMInputStream bomIn = new BOMInputStream(new FileInputStream(file), try (BOMInputStream bomIn = new BOMInputStream(new FileInputStream(file),
ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE); ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE);
Reader reader = new BufferedReader(new InputStreamReader(bomIn, encoding))) { Reader reader = new BufferedReader(new InputStreamReader(bomIn, encoding))) {
for (CharHandler handler : handlers) { for (CharHandler handler : handlers) {
handler.eof(); handler.eof();
} }
return new Metadata(lineCounter.lines(), lineCounter.nonBlankLines(), fileHashComputer.getHash(), lineOffsetCounter.getOriginalLineOffsets(),
lineCounter.isEmpty());

} catch (IOException e) { } catch (IOException e) {
throw new IllegalStateException(String.format("Fail to read file '%s' with encoding '%s'", file.getAbsolutePath(), encoding), e); throw new IllegalStateException(String.format("Fail to read file '%s' with encoding '%s'", file.getAbsolutePath(), encoding), e);
} }
this.originalLineOffsets = Ints.toArray(originalLineOffsets); this.originalLineOffsets = Ints.toArray(originalLineOffsets);
} }
} }

public static interface LineHashConsumer {

void consume(int lineIdx, @Nullable byte[] hash);

}

/**
 * Computes an MD5 hash of each line of the file, after removing all whitespace characters,
 * and feeds the hashes to the given consumer. Line indices are 1-based and a blank line
 * yields a {@code null} hash. Used for issue tracking across analyses.
 */
public static void computeLineHashesForIssueTracking(DefaultInputFile f, LineHashConsumer consumer) {
  scanFile(f.file(), f.charset(), new LineHashComputer(consumer));
}
} }

+ 3
- 1
sonar-batch/src/test/java/org/sonar/batch/issue/tracking/IssueTrackingDecoratorTest.java View File

import com.google.common.base.Charsets; import com.google.common.base.Charsets;
import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
import org.junit.rules.TemporaryFolder; import org.junit.rules.TemporaryFolder;
DefaultInputFile inputFile = mock(DefaultInputFile.class); DefaultInputFile inputFile = mock(DefaultInputFile.class);
java.io.File f = temp.newFile(); java.io.File f = temp.newFile();
when(inputFile.path()).thenReturn(f.toPath()); when(inputFile.path()).thenReturn(f.toPath());
when(inputFile.file()).thenReturn(f);
when(inputFile.charset()).thenReturn(Charsets.UTF_8); when(inputFile.charset()).thenReturn(Charsets.UTF_8);
when(inputFile.lines()).thenReturn(newSource.split("\n").length);
when(inputFile.lines()).thenReturn(StringUtils.countMatches(newSource, "\n") + 1);
FileUtils.write(f, newSource, Charsets.UTF_8); FileUtils.write(f, newSource, Charsets.UTF_8);
when(inputFile.key()).thenReturn("foo:Action.java"); when(inputFile.key()).thenReturn("foo:Action.java");
when(inputPathCache.getFile("foo", "Action.java")).thenReturn(inputFile); when(inputPathCache.getFile("foo", "Action.java")).thenReturn(inputFile);

+ 3
- 1
sonar-batch/src/test/java/org/sonar/batch/issue/tracking/IssueTrackingTest.java View File

import com.google.common.io.Resources; import com.google.common.io.Resources;
import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.junit.Before; import org.junit.Before;
import org.junit.Rule; import org.junit.Rule;
import org.junit.Test; import org.junit.Test;
DefaultInputFile inputFile = mock(DefaultInputFile.class); DefaultInputFile inputFile = mock(DefaultInputFile.class);
File f = temp.newFile(); File f = temp.newFile();
when(inputFile.path()).thenReturn(f.toPath()); when(inputFile.path()).thenReturn(f.toPath());
when(inputFile.file()).thenReturn(f);
when(inputFile.charset()).thenReturn(Charsets.UTF_8); when(inputFile.charset()).thenReturn(Charsets.UTF_8);
String data = load(newSource); String data = load(newSource);
when(inputFile.lines()).thenReturn(data.split("\n").length);
when(inputFile.lines()).thenReturn(StringUtils.countMatches(data, "\n") + 1);
FileUtils.write(f, data, Charsets.UTF_8); FileUtils.write(f, data, Charsets.UTF_8);
when(inputFile.key()).thenReturn("foo:Action.java"); when(inputFile.key()).thenReturn("foo:Action.java");
when(lastSnapshots.getLineHashes("foo:Action.java")).thenReturn(computeHexHashes(load(reference))); when(lastSnapshots.getLineHashes("foo:Action.java")).thenReturn(computeHexHashes(load(reference)));

+ 12
- 0
sonar-batch/src/test/java/org/sonar/batch/scan/filesystem/FileMetadataTest.java View File

import org.sonar.api.batch.AnalysisMode; import org.sonar.api.batch.AnalysisMode;


import java.io.File; import java.io.File;
import java.nio.charset.Charset;


import static org.apache.commons.codec.digest.DigestUtils.md5Hex; import static org.apache.commons.codec.digest.DigestUtils.md5Hex;
import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThat;
assertThat(metadata.empty).isFalse(); assertThat(metadata.empty).isFalse();
} }


@Test
public void read_with_wrong_encoding() throws Exception {
  File tempFile = temp.newFile();
  // Write bytes in cp1252: the '´' character is not a valid UTF-8 byte sequence there
  FileUtils.write(tempFile, "marker´s\n", Charset.forName("cp1252"));

  // Reading those bytes as UTF-8 must not fail; the unmappable byte is decoded
  // as the Unicode replacement character U+FFFD (see expected hash below)
  FileMetadata.Metadata metadata = new FileMetadata(mode).read(tempFile, Charsets.UTF_8);
  assertThat(metadata.lines).isEqualTo(2);
  assertThat(metadata.hash).isEqualTo(md5Hex("marker\ufffds\n"));
  // "marker\ufffds\n" is 9 chars, so line 2 starts at offset 9
  assertThat(metadata.originalLineOffsets).containsOnly(0, 9);
}

@Test @Test
public void non_ascii_utf_8() throws Exception { public void non_ascii_utf_8() throws Exception {
File tempFile = temp.newFile(); File tempFile = temp.newFile();

Loading…
Cancel
Save