@@ -19,6 +19,7 @@ | |||
*/ | |||
package org.sonar.batch.index; | |||
import org.apache.commons.codec.binary.Hex; | |||
import org.apache.commons.codec.digest.DigestUtils; | |||
import org.apache.ibatis.session.ResultContext; | |||
import org.apache.ibatis.session.ResultHandler; | |||
@@ -26,6 +27,8 @@ import org.sonar.api.batch.fs.InputFile; | |||
import org.sonar.api.batch.fs.internal.DefaultInputFile; | |||
import org.sonar.api.utils.System2; | |||
import org.sonar.batch.ProjectTree; | |||
import org.sonar.batch.scan.filesystem.FileMetadata; | |||
import org.sonar.batch.scan.filesystem.FileMetadata.LineHashConsumer; | |||
import org.sonar.batch.scan.filesystem.InputFileMetadata; | |||
import org.sonar.batch.scan.filesystem.InputPathCache; | |||
import org.sonar.core.persistence.DbSession; | |||
@@ -34,10 +37,9 @@ import org.sonar.core.source.db.FileSourceDto; | |||
import org.sonar.core.source.db.FileSourceMapper; | |||
import javax.annotation.CheckForNull; | |||
import javax.annotation.Nullable; | |||
import java.io.BufferedReader; | |||
import java.io.IOException; | |||
import java.nio.file.Files; | |||
import java.util.HashMap; | |||
import java.util.Map; | |||
@@ -98,7 +100,7 @@ public class SourcePersister implements ScanPersister { | |||
.setBinaryData(data) | |||
.setDataHash(dataHash) | |||
.setSrcHash(metadata.hash()) | |||
.setLineHashes(lineHashesAsMd5Hex(inputFile, metadata)) | |||
.setLineHashes(lineHashesAsMd5Hex(inputFile)) | |||
.setCreatedAt(system2.now()) | |||
.setUpdatedAt(system2.now()); | |||
mapper.insert(dto); | |||
@@ -110,7 +112,7 @@ public class SourcePersister implements ScanPersister { | |||
.setBinaryData(data) | |||
.setDataHash(dataHash) | |||
.setSrcHash(metadata.hash()) | |||
.setLineHashes(lineHashesAsMd5Hex(inputFile, metadata)) | |||
.setLineHashes(lineHashesAsMd5Hex(inputFile)) | |||
.setUpdatedAt(system2.now()); | |||
mapper.update(previousDto); | |||
session.commit(); | |||
@@ -119,34 +121,23 @@ public class SourcePersister implements ScanPersister { | |||
} | |||
@CheckForNull | |||
private String lineHashesAsMd5Hex(DefaultInputFile f, InputFileMetadata metadata) { | |||
private String lineHashesAsMd5Hex(DefaultInputFile f) { | |||
if (f.lines() == 0) { | |||
return null; | |||
} | |||
// A md5 string is 32 char long + '\n' = 33 | |||
StringBuilder result = new StringBuilder(f.lines() * (32 + 1)); | |||
final StringBuilder result = new StringBuilder(f.lines() * (32 + 1)); | |||
try { | |||
BufferedReader reader = Files.newBufferedReader(f.path(), f.charset()); | |||
StringBuilder sb = new StringBuilder(); | |||
for (int i = 0; i < f.lines(); i++) { | |||
String lineStr = reader.readLine(); | |||
lineStr = lineStr == null ? "" : lineStr; | |||
for (int j = 0; j < lineStr.length(); j++) { | |||
char c = lineStr.charAt(j); | |||
if (!Character.isWhitespace(c)) { | |||
sb.append(c); | |||
} | |||
} | |||
if (i > 0) { | |||
FileMetadata.computeLineHashesForIssueTracking(f, new LineHashConsumer() { | |||
@Override | |||
public void consume(int lineIdx, @Nullable byte[] hash) { | |||
if (lineIdx > 0) { | |||
result.append("\n"); | |||
} | |||
result.append(sb.length() > 0 ? DigestUtils.md5Hex(sb.toString()) : ""); | |||
sb.setLength(0); | |||
result.append(hash != null ? Hex.encodeHexString(hash) : ""); | |||
} | |||
} catch (Exception e) { | |||
throw new IllegalStateException("Unable to compute line hashes of file " + f, e); | |||
} | |||
}); | |||
return result.toString(); | |||
} |
@@ -19,17 +19,16 @@ | |||
*/ | |||
package org.sonar.batch.issue.tracking; | |||
import com.google.common.base.Charsets; | |||
import com.google.common.collect.LinkedHashMultimap; | |||
import com.google.common.collect.Multimap; | |||
import org.apache.commons.codec.binary.Hex; | |||
import org.apache.commons.codec.digest.DigestUtils; | |||
import org.apache.commons.lang.ObjectUtils; | |||
import org.sonar.api.batch.fs.internal.DefaultInputFile; | |||
import org.sonar.batch.scan.filesystem.FileMetadata; | |||
import org.sonar.batch.scan.filesystem.FileMetadata.LineHashConsumer; | |||
import javax.annotation.Nullable; | |||
import java.io.BufferedReader; | |||
import java.nio.file.Files; | |||
import java.security.MessageDigest; | |||
import java.util.Collection; | |||
/** | |||
@@ -56,27 +55,14 @@ public final class FileHashes { | |||
} | |||
public static FileHashes create(DefaultInputFile f) { | |||
byte[][] hashes = new byte[f.lines()][]; | |||
try { | |||
BufferedReader reader = Files.newBufferedReader(f.path(), f.charset()); | |||
MessageDigest lineMd5Digest = DigestUtils.getMd5Digest(); | |||
StringBuilder sb = new StringBuilder(); | |||
for (int i = 0; i < f.lines(); i++) { | |||
String lineStr = reader.readLine(); | |||
if (lineStr != null) { | |||
for (int j = 0; j < lineStr.length(); j++) { | |||
char c = lineStr.charAt(j); | |||
if (!Character.isWhitespace(c)) { | |||
sb.append(c); | |||
} | |||
} | |||
} | |||
hashes[i] = sb.length() > 0 ? lineMd5Digest.digest(sb.toString().getBytes(Charsets.UTF_8)) : null; | |||
sb.setLength(0); | |||
final byte[][] hashes = new byte[f.lines()][]; | |||
FileMetadata.computeLineHashesForIssueTracking(f, new LineHashConsumer() { | |||
@Override | |||
public void consume(int lineIdx, @Nullable byte[] hash) { | |||
hashes[lineIdx - 1] = hash; | |||
} | |||
} catch (Exception e) { | |||
throw new IllegalStateException("Unable to compute line hashes of file " + f, e); | |||
} | |||
}); | |||
int size = hashes.length; | |||
Multimap<String, Integer> linesByHash = LinkedHashMultimap.create(); |
@@ -25,10 +25,15 @@ import org.apache.commons.codec.binary.Hex; | |||
import org.apache.commons.codec.digest.DigestUtils; | |||
import org.apache.commons.io.ByteOrderMark; | |||
import org.apache.commons.io.input.BOMInputStream; | |||
import org.slf4j.Logger; | |||
import org.slf4j.LoggerFactory; | |||
import org.sonar.api.BatchComponent; | |||
import org.sonar.api.CoreProperties; | |||
import org.sonar.api.batch.AnalysisMode; | |||
import org.sonar.api.batch.fs.internal.DefaultInputFile; | |||
import javax.annotation.CheckForNull; | |||
import javax.annotation.Nullable; | |||
import java.io.BufferedReader; | |||
import java.io.File; | |||
@@ -47,6 +52,8 @@ import java.util.List; | |||
*/ | |||
public class FileMetadata implements BatchComponent { | |||
private static final Logger LOG = LoggerFactory.getLogger(FileMetadata.class); | |||
private static final char LINE_FEED = '\n'; | |||
private static final char CARRIAGE_RETURN = '\r'; | |||
private final AnalysisMode analysisMode; | |||
@@ -55,7 +62,7 @@ public class FileMetadata implements BatchComponent { | |||
this.analysisMode = analysisMode; | |||
} | |||
private abstract class CharHandler { | |||
private static abstract class CharHandler { | |||
void handleAll(char c) { | |||
} | |||
@@ -70,15 +77,28 @@ public class FileMetadata implements BatchComponent { | |||
} | |||
} | |||
private class LineCounter extends CharHandler { | |||
private static class LineCounter extends CharHandler { | |||
private boolean empty = true; | |||
private int lines = 1; | |||
private int nonBlankLines = 0; | |||
private boolean blankLine = true; | |||
boolean alreadyLoggedInvalidCharacter = false; | |||
private final File file; | |||
private final Charset encoding; | |||
LineCounter(File file, Charset encoding) { | |||
this.file = file; | |||
this.encoding = encoding; | |||
} | |||
@Override | |||
void handleAll(char c) { | |||
this.empty = false; | |||
if (!alreadyLoggedInvalidCharacter && c == '\ufffd') { | |||
LOG.warn("Invalid character encountered in file " + file + " at line " + lines | |||
+ " for encoding " + encoding + ". Please fix file content or configure the encoding to be used using property '" + CoreProperties.ENCODING_PROPERTY + "'."); | |||
alreadyLoggedInvalidCharacter = true; | |||
} | |||
} | |||
@Override | |||
@@ -117,10 +137,9 @@ public class FileMetadata implements BatchComponent { | |||
} | |||
} | |||
private class FileHashComputer extends CharHandler { | |||
private static class FileHashComputer extends CharHandler { | |||
private MessageDigest globalMd5Digest = DigestUtils.getMd5Digest(); | |||
StringBuffer sb = new StringBuffer(); | |||
private StringBuffer sb = new StringBuffer(); | |||
@Override | |||
void handleIgnoreEoL(char c) { | |||
@@ -147,7 +166,38 @@ public class FileMetadata implements BatchComponent { | |||
} | |||
} | |||
private class LineOffsetCounter extends CharHandler { | |||
private static class LineHashComputer extends CharHandler { | |||
private final MessageDigest lineMd5Digest = DigestUtils.getMd5Digest(); | |||
private final StringBuffer sb = new StringBuffer(); | |||
private final LineHashConsumer consumer; | |||
private int line = 1; | |||
public LineHashComputer(LineHashConsumer consumer) { | |||
this.consumer = consumer; | |||
} | |||
@Override | |||
void handleIgnoreEoL(char c) { | |||
if (!Character.isWhitespace(c)) { | |||
sb.append(c); | |||
} | |||
} | |||
@Override | |||
void newLine() { | |||
consumer.consume(line, sb.length() > 0 ? lineMd5Digest.digest(sb.toString().getBytes(Charsets.UTF_8)) : null); | |||
sb.setLength(0); | |||
line++; | |||
} | |||
@Override | |||
void eof() { | |||
consumer.consume(line, sb.length() > 0 ? lineMd5Digest.digest(sb.toString().getBytes(Charsets.UTF_8)) : null); | |||
} | |||
} | |||
private static class LineOffsetCounter extends CharHandler { | |||
private int currentOriginalOffset = 0; | |||
private List<Integer> originalLineOffsets = new ArrayList<Integer>(); | |||
@@ -176,17 +226,21 @@ public class FileMetadata implements BatchComponent { | |||
* Maximum performance is needed. | |||
*/ | |||
Metadata read(File file, Charset encoding) { | |||
char c = (char) 0; | |||
LineCounter lineCounter = new LineCounter(); | |||
LineCounter lineCounter = new LineCounter(file, encoding); | |||
FileHashComputer fileHashComputer = new FileHashComputer(); | |||
LineOffsetCounter lineOffsetCounter = new LineOffsetCounter(); | |||
CharHandler[] handlers; | |||
if (analysisMode.isPreview()) { | |||
// No need to compute line offsets in preview mode since there is no syntax highlighting | |||
handlers = new CharHandler[] {lineCounter, fileHashComputer}; | |||
if (!analysisMode.isPreview()) { | |||
scanFile(file, encoding, lineCounter, fileHashComputer, lineOffsetCounter); | |||
} else { | |||
handlers = new CharHandler[] {lineCounter, fileHashComputer, lineOffsetCounter}; | |||
// No need to compute line offsets in preview mode since there is no syntax highlighting | |||
scanFile(file, encoding, lineCounter, fileHashComputer); | |||
} | |||
return new Metadata(lineCounter.lines(), lineCounter.nonBlankLines(), fileHashComputer.getHash(), lineOffsetCounter.getOriginalLineOffsets(), | |||
lineCounter.isEmpty()); | |||
} | |||
private static void scanFile(File file, Charset encoding, CharHandler... handlers) { | |||
char c = (char) 0; | |||
try (BOMInputStream bomIn = new BOMInputStream(new FileInputStream(file), | |||
ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE); | |||
Reader reader = new BufferedReader(new InputStreamReader(bomIn, encoding))) { | |||
@@ -224,9 +278,6 @@ public class FileMetadata implements BatchComponent { | |||
for (CharHandler handler : handlers) { | |||
handler.eof(); | |||
} | |||
return new Metadata(lineCounter.lines(), lineCounter.nonBlankLines(), fileHashComputer.getHash(), lineOffsetCounter.getOriginalLineOffsets(), | |||
lineCounter.isEmpty()); | |||
} catch (IOException e) { | |||
throw new IllegalStateException(String.format("Fail to read file '%s' with encoding '%s'", file.getAbsolutePath(), encoding), e); | |||
} | |||
@@ -247,4 +298,17 @@ public class FileMetadata implements BatchComponent { | |||
this.originalLineOffsets = Ints.toArray(originalLineOffsets); | |||
} | |||
} | |||
public static interface LineHashConsumer { | |||
void consume(int lineIdx, @Nullable byte[] hash); | |||
} | |||
/** | |||
* Compute a MD5 hash of each line of the file after removing of all blank chars | |||
*/ | |||
public static void computeLineHashesForIssueTracking(DefaultInputFile f, LineHashConsumer consumer) { | |||
scanFile(f.file(), f.charset(), new LineHashComputer(consumer)); | |||
} | |||
} |
@@ -22,6 +22,7 @@ package org.sonar.batch.issue.tracking; | |||
import com.google.common.base.Charsets; | |||
import org.apache.commons.codec.digest.DigestUtils; | |||
import org.apache.commons.io.FileUtils; | |||
import org.apache.commons.lang.StringUtils; | |||
import org.junit.Before; | |||
import org.junit.Test; | |||
import org.junit.rules.TemporaryFolder; | |||
@@ -224,8 +225,9 @@ public class IssueTrackingDecoratorTest { | |||
DefaultInputFile inputFile = mock(DefaultInputFile.class); | |||
java.io.File f = temp.newFile(); | |||
when(inputFile.path()).thenReturn(f.toPath()); | |||
when(inputFile.file()).thenReturn(f); | |||
when(inputFile.charset()).thenReturn(Charsets.UTF_8); | |||
when(inputFile.lines()).thenReturn(newSource.split("\n").length); | |||
when(inputFile.lines()).thenReturn(StringUtils.countMatches(newSource, "\n") + 1); | |||
FileUtils.write(f, newSource, Charsets.UTF_8); | |||
when(inputFile.key()).thenReturn("foo:Action.java"); | |||
when(inputPathCache.getFile("foo", "Action.java")).thenReturn(inputFile); |
@@ -25,6 +25,7 @@ import com.google.common.collect.Lists; | |||
import com.google.common.io.Resources; | |||
import org.apache.commons.codec.digest.DigestUtils; | |||
import org.apache.commons.io.FileUtils; | |||
import org.apache.commons.lang.StringUtils; | |||
import org.junit.Before; | |||
import org.junit.Rule; | |||
import org.junit.Test; | |||
@@ -354,9 +355,10 @@ public class IssueTrackingTest { | |||
DefaultInputFile inputFile = mock(DefaultInputFile.class); | |||
File f = temp.newFile(); | |||
when(inputFile.path()).thenReturn(f.toPath()); | |||
when(inputFile.file()).thenReturn(f); | |||
when(inputFile.charset()).thenReturn(Charsets.UTF_8); | |||
String data = load(newSource); | |||
when(inputFile.lines()).thenReturn(data.split("\n").length); | |||
when(inputFile.lines()).thenReturn(StringUtils.countMatches(data, "\n") + 1); | |||
FileUtils.write(f, data, Charsets.UTF_8); | |||
when(inputFile.key()).thenReturn("foo:Action.java"); | |||
when(lastSnapshots.getLineHashes("foo:Action.java")).thenReturn(computeHexHashes(load(reference))); |
@@ -28,6 +28,7 @@ import org.junit.rules.TemporaryFolder; | |||
import org.sonar.api.batch.AnalysisMode; | |||
import java.io.File; | |||
import java.nio.charset.Charset; | |||
import static org.apache.commons.codec.digest.DigestUtils.md5Hex; | |||
import static org.assertj.core.api.Assertions.assertThat; | |||
@@ -69,6 +70,17 @@ public class FileMetadataTest { | |||
assertThat(metadata.empty).isFalse(); | |||
} | |||
@Test | |||
public void read_with_wrong_encoding() throws Exception { | |||
File tempFile = temp.newFile(); | |||
FileUtils.write(tempFile, "marker´s\n", Charset.forName("cp1252")); | |||
FileMetadata.Metadata metadata = new FileMetadata(mode).read(tempFile, Charsets.UTF_8); | |||
assertThat(metadata.lines).isEqualTo(2); | |||
assertThat(metadata.hash).isEqualTo(md5Hex("marker\ufffds\n")); | |||
assertThat(metadata.originalLineOffsets).containsOnly(0, 9); | |||
} | |||
@Test | |||
public void non_ascii_utf_8() throws Exception { | |||
File tempFile = temp.newFile(); |