From 6782a76dcf2e42fea32116a22d1c3ecb8d2975fa Mon Sep 17 00:00:00 2001 From: Duarte Meneses Date: Fri, 7 Apr 2017 14:20:09 +0200 Subject: [PATCH] SONAR-6100 Improve support of binary files and/or files with different encoding --- .../api/batch/fs/internal/FileMetadata.java | 52 +++---- .../batch/fs/internal/FileMetadataTest.java | 61 +++----- .../scan/filesystem/CharsetDetector.java | 132 ++++++++++++++++++ .../scan/filesystem/MetadataGenerator.java | 40 +----- .../IssueExclusionsRegexpScannerTest.java | 34 +++-- .../issue/tracking/SourceHashHolderTest.java | 3 + .../scan/filesystem/CharsetDetectorTest.java | 95 +++++++++++++ .../filesystem/MetadataGeneratorTest.java | 50 +++++-- .../file-with-single-regexp-last-line.txt | 6 +- .../file-with-single-regexp.txt | 4 +- 10 files changed, 347 insertions(+), 130 deletions(-) create mode 100644 sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetDetector.java create mode 100644 sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetDetectorTest.java diff --git a/sonar-plugin-api/src/main/java/org/sonar/api/batch/fs/internal/FileMetadata.java b/sonar-plugin-api/src/main/java/org/sonar/api/batch/fs/internal/FileMetadata.java index 4ddcf1d8d4e..f444c26bd0d 100644 --- a/sonar-plugin-api/src/main/java/org/sonar/api/batch/fs/internal/FileMetadata.java +++ b/sonar-plugin-api/src/main/java/org/sonar/api/batch/fs/internal/FileMetadata.java @@ -21,8 +21,8 @@ package org.sonar.api.batch.fs.internal; import java.io.BufferedReader; import java.io.File; -import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.nio.ByteBuffer; @@ -39,8 +39,6 @@ import javax.annotation.Nullable; import org.apache.commons.codec.binary.Hex; import org.apache.commons.codec.digest.DigestUtils; -import org.apache.commons.io.ByteOrderMark; -import org.apache.commons.io.input.BOMInputStream; import org.sonar.api.CoreProperties; import org.sonar.api.batch.ScannerSide; import org.sonar.api.batch.fs.InputFile; @@ -79,18 +77,18 @@ public class FileMetadata { private int nonBlankLines = 0; private boolean blankLine = true; boolean alreadyLoggedInvalidCharacter = false; - private final File file; + private final String filePath; private final Charset encoding; - LineCounter(File file, Charset encoding) { - this.file = file; + LineCounter(String filePath, Charset encoding) { + this.filePath = filePath; this.encoding = encoding; } @Override protected void handleAll(char c) { if (!alreadyLoggedInvalidCharacter && c == '\ufffd') { - LOG.warn("Invalid character encountered in file {} at line {} for encoding {}. Please fix file content or configure the encoding to be used using property '{}'.", file, + LOG.warn("Invalid character encountered in file {} at line {} for encoding {}. Please fix file content or configure the encoding to be used using property '{}'.", filePath, lines, encoding, CoreProperties.ENCODING_PROPERTY); alreadyLoggedInvalidCharacter = true; } @@ -133,13 +131,13 @@ public class FileMetadata { private MessageDigest globalMd5Digest = DigestUtils.getMd5Digest(); private StringBuilder sb = new StringBuilder(); private final CharsetEncoder encoder; - private final File file; + private final String filePath; - public FileHashComputer(File f) { + public FileHashComputer(String filePath) { encoder = StandardCharsets.UTF_8.newEncoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); - file = f; + this.filePath = filePath; } @Override @@ -168,7 +166,7 @@ public class FileMetadata { globalMd5Digest.update(encoded.array(), 0, encoded.limit()); } } catch (CharacterCodingException e) { - throw new IllegalStateException("Error encoding line hash in file: " + file.getAbsolutePath(), e); + throw new IllegalStateException("Error encoding line hash in file: " + filePath, e); } } @@ -272,32 +270,32 @@ public class FileMetadata { * Compute hash of a file ignoring line ends differences. * Maximum performance is needed. */ - public Metadata readMetadata(File file, Charset encoding, @Nullable CharHandler otherHandler) { - LineCounter lineCounter = new LineCounter(file, encoding); - FileHashComputer fileHashComputer = new FileHashComputer(file); + public Metadata readMetadata(InputStream stream, Charset encoding, String filePath, @Nullable CharHandler otherHandler) { + LineCounter lineCounter = new LineCounter(filePath, encoding); + FileHashComputer fileHashComputer = new FileHashComputer(filePath); LineOffsetCounter lineOffsetCounter = new LineOffsetCounter(); if (otherHandler != null) { CharHandler[] handlers = {lineCounter, fileHashComputer, lineOffsetCounter, otherHandler}; - readFile(file, encoding, handlers); + readFile(stream, encoding, filePath, handlers); } else { CharHandler[] handlers = {lineCounter, fileHashComputer, lineOffsetCounter}; - readFile(file, encoding, handlers); + readFile(stream, encoding, filePath, handlers); } return new Metadata(lineCounter.lines(), lineCounter.nonBlankLines(), fileHashComputer.getHash(), lineOffsetCounter.getOriginalLineOffsets(), lineOffsetCounter.getLastValidOffset()); } - public Metadata readMetadata(File file, Charset encoding) { - return readMetadata(file, encoding, null); + public Metadata readMetadata(InputStream stream, Charset encoding, String filePath) { + return readMetadata(stream, encoding, filePath, null); } /** * For testing purpose */ public Metadata readMetadata(Reader reader) { - LineCounter lineCounter = new LineCounter(new File("fromString"), StandardCharsets.UTF_16); - FileHashComputer fileHashComputer = new FileHashComputer(new File("fromString")); + LineCounter lineCounter = new LineCounter("fromString", StandardCharsets.UTF_16); + FileHashComputer fileHashComputer = new FileHashComputer("fromString"); LineOffsetCounter lineOffsetCounter = new LineOffsetCounter(); CharHandler[] handlers = {lineCounter, fileHashComputer, lineOffsetCounter}; @@ -310,13 +308,11 @@ public class FileMetadata { lineOffsetCounter.getLastValidOffset()); } - public static void readFile(File file, Charset encoding, CharHandler[] handlers) { - try (BOMInputStream bomIn = new BOMInputStream(new FileInputStream(file), - ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE); - Reader reader = new BufferedReader(new InputStreamReader(bomIn, encoding))) { + public static void readFile(InputStream stream, Charset encoding, String filePath, CharHandler[] handlers) { + try (Reader reader = new BufferedReader(new InputStreamReader(stream, encoding))) { read(reader, handlers); } catch (IOException e) { - throw new IllegalStateException(String.format("Fail to read file '%s' with encoding '%s'", file.getAbsolutePath(), encoding), e); + throw new IllegalStateException(String.format("Fail to read file '%s' with encoding '%s'", filePath, encoding), e); } } @@ -376,6 +372,10 @@ public class FileMetadata { * Compute a MD5 hash of each line of the file after removing of all blank chars */ public static void computeLineHashesForIssueTracking(InputFile f, LineHashConsumer consumer) { - readFile(f.file(), f.charset(), new CharHandler[] {new LineHashComputer(consumer, f.file())}); + try { + readFile(f.inputStream(), f.charset(), f.absolutePath(), new CharHandler[] {new LineHashComputer(consumer, f.file())}); + } catch (IOException e) { + throw new IllegalStateException("Failed to compute line hashes for " + f.absolutePath(), e); + } } } diff --git a/sonar-plugin-api/src/test/java/org/sonar/api/batch/fs/internal/FileMetadataTest.java b/sonar-plugin-api/src/test/java/org/sonar/api/batch/fs/internal/FileMetadataTest.java index e0145f4c918..e8ad6c9a7f5 100644 --- a/sonar-plugin-api/src/test/java/org/sonar/api/batch/fs/internal/FileMetadataTest.java +++ b/sonar-plugin-api/src/test/java/org/sonar/api/batch/fs/internal/FileMetadataTest.java @@ -20,8 +20,10 @@ package org.sonar.api.batch.fs.internal; import java.io.File; +import java.io.FileInputStream; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; + import javax.annotation.Nullable; import org.apache.commons.codec.binary.Hex; import org.apache.commons.io.FileUtils; @@ -53,7 +55,7 @@ public class FileMetadataTest { File tempFile = temp.newFile(); FileUtils.touch(tempFile); - Metadata metadata = new FileMetadata().readMetadata(tempFile, StandardCharsets.UTF_8); + Metadata metadata = new FileMetadata().readMetadata(new FileInputStream(tempFile), StandardCharsets.UTF_8, tempFile.getName()); assertThat(metadata.lines()).isEqualTo(1); assertThat(metadata.nonBlankLines()).isEqualTo(0); assertThat(metadata.hash()).isNotEmpty(); @@ -66,7 +68,7 @@ public class FileMetadataTest { File tempFile = temp.newFile(); FileUtils.write(tempFile, "foo\r\nbar\r\nbaz", StandardCharsets.UTF_8, true); - Metadata metadata = new FileMetadata().readMetadata(tempFile, StandardCharsets.UTF_8); + Metadata metadata = new FileMetadata().readMetadata(new FileInputStream(tempFile), StandardCharsets.UTF_8, tempFile.getName()); assertThat(metadata.lines()).isEqualTo(3); assertThat(metadata.nonBlankLines()).isEqualTo(3); assertThat(metadata.hash()).isEqualTo(md5Hex("foo\nbar\nbaz")); @@ -79,7 +81,7 @@ public class FileMetadataTest { File tempFile = temp.newFile(); FileUtils.write(tempFile, "marker´s\n", Charset.forName("cp1252")); - Metadata metadata = new FileMetadata().readMetadata(tempFile, StandardCharsets.UTF_8); + Metadata metadata = new FileMetadata().readMetadata(new FileInputStream(tempFile), StandardCharsets.UTF_8, tempFile.getName()); assertThat(metadata.lines()).isEqualTo(2); assertThat(metadata.hash()).isEqualTo(md5Hex("marker\ufffds\n")); assertThat(metadata.originalLineOffsets()).containsOnly(0, 9); @@ -90,7 +92,7 @@ public class FileMetadataTest { File tempFile = temp.newFile(); FileUtils.write(tempFile, "föo\r\nbàr\r\n\u1D11Ebaßz\r\n", StandardCharsets.UTF_8, true); - Metadata metadata = new FileMetadata().readMetadata(tempFile, StandardCharsets.UTF_8); + Metadata metadata = new FileMetadata().readMetadata(new FileInputStream(tempFile), StandardCharsets.UTF_8, tempFile.getName()); assertThat(metadata.lines()).isEqualTo(4); assertThat(metadata.nonBlankLines()).isEqualTo(3); assertThat(metadata.hash()).isEqualTo(md5Hex("föo\nbàr\n\u1D11Ebaßz\n")); @@ -101,7 +103,7 @@ public class FileMetadataTest { public void non_ascii_utf_16() throws Exception { File tempFile = temp.newFile(); FileUtils.write(tempFile, "föo\r\nbàr\r\n\u1D11Ebaßz\r\n", StandardCharsets.UTF_16, true); - Metadata metadata = new FileMetadata().readMetadata(tempFile, StandardCharsets.UTF_16); + Metadata metadata = new FileMetadata().readMetadata(new FileInputStream(tempFile), StandardCharsets.UTF_16, tempFile.getName()); assertThat(metadata.lines()).isEqualTo(4); assertThat(metadata.nonBlankLines()).isEqualTo(3); assertThat(metadata.hash()).isEqualTo(md5Hex("föo\nbàr\n\u1D11Ebaßz\n".getBytes(StandardCharsets.UTF_8))); @@ -113,7 +115,7 @@ public class FileMetadataTest { File tempFile = temp.newFile(); FileUtils.write(tempFile, "foo\nbar\nbaz", StandardCharsets.UTF_8, true); - Metadata metadata = new FileMetadata().readMetadata(tempFile, StandardCharsets.UTF_8); + Metadata metadata = new FileMetadata().readMetadata(new FileInputStream(tempFile), StandardCharsets.UTF_8, tempFile.getName()); assertThat(metadata.lines()).isEqualTo(3); assertThat(metadata.nonBlankLines()).isEqualTo(3); assertThat(metadata.hash()).isEqualTo(md5Hex("foo\nbar\nbaz")); @@ -126,7 +128,7 @@ public class FileMetadataTest { File tempFile = temp.newFile(); FileUtils.write(tempFile, "foo\nbar\nbaz\n", StandardCharsets.UTF_8, true); - Metadata metadata = new FileMetadata().readMetadata(tempFile, StandardCharsets.UTF_8); + Metadata metadata = new FileMetadata().readMetadata(new FileInputStream(tempFile), StandardCharsets.UTF_8, tempFile.getName()); assertThat(metadata.lines()).isEqualTo(4); assertThat(metadata.nonBlankLines()).isEqualTo(3); assertThat(metadata.hash()).isEqualTo(md5Hex("foo\nbar\nbaz\n")); @@ -139,7 +141,7 @@ public class FileMetadataTest { File tempFile = temp.newFile(); FileUtils.write(tempFile, "foo\rbar\rbaz", StandardCharsets.UTF_8, true); - Metadata metadata = new FileMetadata().readMetadata(tempFile, StandardCharsets.UTF_8); + Metadata metadata = new FileMetadata().readMetadata(new FileInputStream(tempFile), StandardCharsets.UTF_8, tempFile.getName()); assertThat(metadata.lines()).isEqualTo(3); assertThat(metadata.nonBlankLines()).isEqualTo(3); assertThat(metadata.hash()).isEqualTo(md5Hex("foo\nbar\nbaz")); @@ -152,7 +154,7 @@ public class FileMetadataTest { File tempFile = temp.newFile(); FileUtils.write(tempFile, "foo\rbar\rbaz\r", StandardCharsets.UTF_8, true); - Metadata metadata = new FileMetadata().readMetadata(tempFile, StandardCharsets.UTF_8); + Metadata metadata = new FileMetadata().readMetadata(new FileInputStream(tempFile), StandardCharsets.UTF_8, tempFile.getName()); assertThat(metadata.lines()).isEqualTo(4); assertThat(metadata.nonBlankLines()).isEqualTo(3); assertThat(metadata.hash()).isEqualTo(md5Hex("foo\nbar\nbaz\n")); @@ -165,7 +167,7 @@ public class FileMetadataTest { File tempFile = temp.newFile(); FileUtils.write(tempFile, "foo\nbar\r\nbaz\n", StandardCharsets.UTF_8, true); - Metadata metadata = new FileMetadata().readMetadata(tempFile, StandardCharsets.UTF_8); + Metadata metadata = new FileMetadata().readMetadata(new FileInputStream(tempFile), StandardCharsets.UTF_8, tempFile.getName()); assertThat(metadata.lines()).isEqualTo(4); assertThat(metadata.nonBlankLines()).isEqualTo(3); assertThat(metadata.hash()).isEqualTo(md5Hex("foo\nbar\nbaz\n")); @@ -177,7 +179,7 @@ public class FileMetadataTest { File tempFile = temp.newFile(); FileUtils.write(tempFile, "foo\n\n\nbar", StandardCharsets.UTF_8, true); - Metadata metadata = new FileMetadata().readMetadata(tempFile, StandardCharsets.UTF_8); + Metadata metadata = new FileMetadata().readMetadata(new FileInputStream(tempFile), StandardCharsets.UTF_8, tempFile.getName()); assertThat(metadata.lines()).isEqualTo(4); assertThat(metadata.nonBlankLines()).isEqualTo(2); assertThat(metadata.hash()).isEqualTo(md5Hex("foo\n\n\nbar")); @@ -189,7 +191,7 @@ public class FileMetadataTest { File tempFile = temp.newFile(); FileUtils.write(tempFile, "foo\nbar\r\nbaz", StandardCharsets.UTF_8, true); - Metadata metadata = new FileMetadata().readMetadata(tempFile, StandardCharsets.UTF_8); + Metadata metadata = new FileMetadata().readMetadata(new FileInputStream(tempFile), StandardCharsets.UTF_8, tempFile.getName()); assertThat(metadata.lines()).isEqualTo(3); assertThat(metadata.nonBlankLines()).isEqualTo(3); assertThat(metadata.hash()).isEqualTo(md5Hex("foo\nbar\nbaz")); @@ -201,25 +203,13 @@ public class FileMetadataTest { File tempFile = temp.newFile(); FileUtils.write(tempFile, "\nfoo\nbar\r\nbaz", StandardCharsets.UTF_8, true); - Metadata metadata = new FileMetadata().readMetadata(tempFile, StandardCharsets.UTF_8); + Metadata metadata = new FileMetadata().readMetadata(new FileInputStream(tempFile), StandardCharsets.UTF_8, tempFile.getName()); assertThat(metadata.lines()).isEqualTo(4); assertThat(metadata.nonBlankLines()).isEqualTo(3); assertThat(metadata.hash()).isEqualTo(md5Hex("\nfoo\nbar\nbaz")); assertThat(metadata.originalLineOffsets()).containsOnly(0, 1, 5, 10); } - @Test - public void start_with_bom() throws Exception { - File tempFile = temp.newFile(); - FileUtils.write(tempFile, "\uFEFFfoo\nbar\r\nbaz", StandardCharsets.UTF_8, true); - - Metadata metadata = new FileMetadata().readMetadata(tempFile, StandardCharsets.UTF_8); - assertThat(metadata.lines()).isEqualTo(3); - assertThat(metadata.nonBlankLines()).isEqualTo(3); - assertThat(metadata.hash()).isEqualTo(md5Hex("foo\nbar\nbaz")); - assertThat(metadata.originalLineOffsets()).containsOnly(0, 4, 9); - } - @Test public void ignore_whitespace_when_computing_line_hashes() throws Exception { File tempFile = temp.newFile(); @@ -274,17 +264,6 @@ public class FileMetadataTest { }); } - @Test - public void should_throw_if_file_does_not_exist() throws Exception { - File tempFolder = temp.newFolder(); - File file = new File(tempFolder, "doesNotExist.txt"); - - thrown.expect(IllegalStateException.class); - thrown.expectMessage("Fail to read file '" + file.getAbsolutePath() + "' with encoding 'UTF-8'"); - - new FileMetadata().readMetadata(file, StandardCharsets.UTF_8); - } - @Test public void line_feed_is_included_into_hash() throws Exception { File file1 = temp.newFile(); @@ -297,9 +276,10 @@ public class FileMetadataTest { File file2 = temp.newFile(); FileUtils.write(file2, "foo\nbar", StandardCharsets.UTF_8, true); - String hash1 = new FileMetadata().readMetadata(file1, StandardCharsets.UTF_8).hash(); - String hash1a = new FileMetadata().readMetadata(file1a, StandardCharsets.UTF_8).hash(); - String hash2 = new FileMetadata().readMetadata(file2, StandardCharsets.UTF_8).hash(); + String hash1 = new FileMetadata().readMetadata(new FileInputStream(file1), StandardCharsets.UTF_8, file1.getName()).hash(); + String hash1a = new FileMetadata().readMetadata(new FileInputStream(file1a), StandardCharsets.UTF_8, file1a.getName()).hash(); + String hash2 = new FileMetadata().readMetadata(new FileInputStream(file2), StandardCharsets.UTF_8, file2.getName()).hash(); + assertThat(hash1).isEqualTo(hash1a); assertThat(hash1).isNotEqualTo(hash2); } @@ -308,7 +288,8 @@ public class FileMetadataTest { public void binary_file_with_unmappable_character() throws Exception { File woff = new File(this.getClass().getResource("glyphicons-halflings-regular.woff").toURI()); - Metadata metadata = new FileMetadata().readMetadata(woff, StandardCharsets.UTF_8); + Metadata metadata = new FileMetadata().readMetadata(new FileInputStream(woff), StandardCharsets.UTF_8, woff.getAbsolutePath()); + assertThat(metadata.lines()).isEqualTo(135); assertThat(metadata.nonBlankLines()).isEqualTo(133); assertThat(metadata.hash()).isNotEmpty(); diff --git a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetDetector.java b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetDetector.java new file mode 100644 index 00000000000..ae6aa5532da --- /dev/null +++ b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetDetector.java @@ -0,0 +1,132 @@ +/* + * SonarQube + * Copyright (C) 2009-2017 SonarSource SA + * mailto:info AT sonarsource DOT com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ +package org.sonar.scanner.scan.filesystem; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.LinkedHashSet; +import java.util.Set; + +import javax.annotation.CheckForNull; + +import org.apache.commons.io.ByteOrderMark; +import org.apache.commons.io.IOUtils; +import org.apache.commons.io.input.BOMInputStream; + +public class CharsetDetector { + private static final int BYTES_TO_DECODE = 512; + private Path filePath; + private BOMInputStream stream; + private Charset detectedCharset; + private Charset defaultEncoding; + + public CharsetDetector(Path filePath, Charset defaultEncoding) { + this.filePath = filePath; + this.defaultEncoding = defaultEncoding; + } + + public boolean run() { + try { + stream = createInputStream(filePath); + if (detectBOM()) { + return true; + } + + return detectCharset(); + } catch (IOException e) { + throw new IllegalStateException("Unable to read file " + filePath.toAbsolutePath().toString(), e); + } + } + + @CheckForNull + public Charset charset() { + assertRun(); + return detectedCharset; + } + + public InputStream inputStream() { + assertRun(); + return stream; + } + + private static BOMInputStream createInputStream(Path path) throws IOException { + BufferedInputStream bufferedStream = new BufferedInputStream(Files.newInputStream(path)); + return new BOMInputStream(bufferedStream, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, + ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE); + } + + private boolean detectBOM() throws IOException { + String charsetName = stream.getBOMCharsetName(); + if (charsetName != null) { + detectedCharset = Charset.forName(charsetName); + return true; + } + return false; + } + + @CheckForNull + private boolean detectCharset() throws IOException { + stream.mark(BYTES_TO_DECODE); + byte[] buf = new byte[BYTES_TO_DECODE]; + int len = IOUtils.read(stream, buf, 0, BYTES_TO_DECODE); + stream.reset(); + + Set charsets = new LinkedHashSet<>(); + charsets.add(defaultEncoding); + charsets.add(StandardCharsets.UTF_8); + charsets.add(Charset.defaultCharset()); + + for (Charset c : charsets) { + if (tryDecode(buf, len, c)) { + detectedCharset = c; + return true; + } + } + return false; + } + + private static boolean tryDecode(byte[] bytes, int len, Charset charset) throws IOException { + CharsetDecoder decoder = charset.newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + + try { + decoder.decode(ByteBuffer.wrap(bytes, 0, len)); + } catch (CharacterCodingException e) { + return false; + } + return true; + } + + private void assertRun() { + if (stream == null) { + throw new IllegalStateException("Charset detection did not run"); + } + } +} diff --git a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/MetadataGenerator.java b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/MetadataGenerator.java index c04483cc784..f8b796688f7 100644 --- a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/MetadataGenerator.java +++ b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/MetadataGenerator.java @@ -19,14 +19,8 @@ */ package org.sonar.scanner.scan.filesystem; -import com.google.common.annotations.VisibleForTesting; - -import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -37,6 +31,8 @@ import org.sonar.api.batch.fs.internal.FileMetadata; import org.sonar.api.batch.fs.internal.Metadata; import org.sonar.scanner.issue.ignore.scanner.IssueExclusionsLoader; +import com.google.common.annotations.VisibleForTesting; + class MetadataGenerator { private static final Logger LOG = LoggerFactory.getLogger(MetadataGenerator.class); @VisibleForTesting @@ -62,10 +58,13 @@ class MetadataGenerator { * It is an expensive computation, reading the entire file. */ public void setMetadata(final DefaultInputFile inputFile, Charset defaultEncoding) { + CharsetDetector detector = new CharsetDetector(inputFile.path(), defaultEncoding); try { - Charset charset = detectCharset(inputFile.path(), defaultEncoding); + detector.run(); + Charset charset = detector.charset(); + InputStream is = detector.inputStream(); inputFile.setCharset(charset); - Metadata metadata = fileMetadata.readMetadata(inputFile.file(), charset, exclusionsScanner.createCharHandlerFor(inputFile.key())); + Metadata metadata = fileMetadata.readMetadata(is, charset, inputFile.absolutePath(), exclusionsScanner.createCharHandlerFor(inputFile.key())); inputFile.setMetadata(metadata); inputFile.setStatus(statusDetection.status(inputModule.definition().getKeyWithBranch(), inputFile.relativePath(), metadata.hash())); LOG.debug("'{}' generated metadata {} with charset '{}'", inputFile.relativePath(), inputFile.type() == Type.TEST ? "as test " : "", charset); @@ -74,29 +73,4 @@ class MetadataGenerator { } } - /** - * @return charset detected from BOM in given file or given defaultCharset - * @throws IllegalStateException if an I/O error occurs - */ - private static Charset detectCharset(Path path, Charset defaultCharset) { - try (InputStream inputStream = Files.newInputStream(path)) { - byte[] bom = new byte[4]; - int n = inputStream.read(bom, 0, bom.length); - if ((n >= 3) && (bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) { - return StandardCharsets.UTF_8; - } else if ((n >= 4) && (bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) { - return UTF_32BE; - } else if ((n >= 4) && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) { - return UTF_32LE; - } else if ((n >= 2) && (bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) { - return StandardCharsets.UTF_16BE; - } else if ((n >= 2) && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) { - return StandardCharsets.UTF_16LE; - } else { - return defaultCharset; - } - } catch (IOException e) { - throw new IllegalStateException("Unable to read file " + path.toAbsolutePath().toString(), e); - } - } } diff --git a/sonar-scanner-engine/src/test/java/org/sonar/scanner/issue/ignore/scanner/IssueExclusionsRegexpScannerTest.java b/sonar-scanner-engine/src/test/java/org/sonar/scanner/issue/ignore/scanner/IssueExclusionsRegexpScannerTest.java index 37c2b957b65..e3a322e6938 100644 --- a/sonar-scanner-engine/src/test/java/org/sonar/scanner/issue/ignore/scanner/IssueExclusionsRegexpScannerTest.java +++ b/sonar-scanner-engine/src/test/java/org/sonar/scanner/issue/ignore/scanner/IssueExclusionsRegexpScannerTest.java @@ -31,7 +31,9 @@ import org.sonar.scanner.issue.ignore.pattern.PatternMatcher; import org.sonar.scanner.issue.ignore.scanner.IssueExclusionsRegexpScanner; import org.sonar.scanner.issue.ignore.scanner.IssueExclusionsLoader.DoubleRegexpMatcher; +import java.io.IOException; import java.net.URISyntaxException; +import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Arrays; @@ -72,10 +74,11 @@ public class IssueExclusionsRegexpScannerTest { javaFile = "org.sonar.test.MyFile"; regexpScanner = new IssueExclusionsRegexpScanner(javaFile, allFilePatterns, blockPatterns, patternMatcher); } - + @Test - public void shouldDetectPatternLastLine() throws URISyntaxException { - fileMetadata.readMetadata(getResource("file-with-single-regexp-last-line.txt").toFile(), UTF_8, regexpScanner); + public void shouldDetectPatternLastLine() throws URISyntaxException, IOException { + Path filePath = getResource("file-with-single-regexp-last-line.txt"); + fileMetadata.readMetadata(Files.newInputStream(filePath), UTF_8, filePath.toString(), regexpScanner); verify(patternMatcher, times(1)).addPatternToExcludeResource(javaFile); verifyNoMoreInteractions(patternMatcher); @@ -83,13 +86,16 @@ public class IssueExclusionsRegexpScannerTest { @Test public void shouldDoNothing() throws Exception { - fileMetadata.readMetadata(getResource("file-with-no-regexp.txt").toFile(), UTF_8, regexpScanner); + Path filePath = getResource("file-with-no-regexp.txt"); + fileMetadata.readMetadata(Files.newInputStream(filePath), UTF_8, filePath.toString(), regexpScanner); + verifyNoMoreInteractions(patternMatcher); } @Test public void shouldAddPatternToExcludeFile() throws Exception { - fileMetadata.readMetadata(getResource("file-with-single-regexp.txt").toFile(), UTF_8, regexpScanner); + Path filePath = getResource("file-with-single-regexp.txt"); + fileMetadata.readMetadata(Files.newInputStream(filePath), UTF_8, filePath.toString(), regexpScanner); verify(patternMatcher, times(1)).addPatternToExcludeResource(javaFile); verifyNoMoreInteractions(patternMatcher); @@ -97,7 +103,8 @@ public class IssueExclusionsRegexpScannerTest { @Test public void shouldAddPatternToExcludeFileEvenIfAlsoDoubleRegexps() throws Exception { - fileMetadata.readMetadata(getResource("file-with-single-regexp-and-double-regexp.txt").toFile(), UTF_8, regexpScanner); + Path filePath = getResource("file-with-single-regexp-and-double-regexp.txt"); + fileMetadata.readMetadata(Files.newInputStream(filePath), UTF_8, filePath.toString(), regexpScanner); Set lineRanges = new HashSet<>(); lineRanges.add(new LineRange(5, 26)); @@ -108,7 +115,8 @@ public class IssueExclusionsRegexpScannerTest { @Test public void shouldAddPatternToExcludeLines() throws Exception { - fileMetadata.readMetadata(getResource("file-with-double-regexp.txt").toFile(), UTF_8, regexpScanner); + Path filePath = getResource("file-with-double-regexp.txt"); + fileMetadata.readMetadata(Files.newInputStream(filePath), UTF_8, filePath.toString(), regexpScanner); Set lineRanges = new HashSet<>(); lineRanges.add(new LineRange(21, 25)); @@ -118,7 +126,8 @@ public class IssueExclusionsRegexpScannerTest { @Test public void shouldAddPatternToExcludeLinesTillTheEnd() throws Exception { - fileMetadata.readMetadata(getResource("file-with-double-regexp-unfinished.txt").toFile(), UTF_8, regexpScanner); + Path filePath = getResource("file-with-double-regexp-unfinished.txt"); + fileMetadata.readMetadata(Files.newInputStream(filePath), UTF_8, filePath.toString(), regexpScanner); Set lineRanges = new HashSet<>(); lineRanges.add(new LineRange(21, 34)); @@ -128,7 +137,8 @@ public class IssueExclusionsRegexpScannerTest { @Test public void shouldAddPatternToExcludeSeveralLineRanges() throws Exception { - fileMetadata.readMetadata(getResource("file-with-double-regexp-twice.txt").toFile(), UTF_8, regexpScanner); + Path filePath = getResource("file-with-double-regexp-twice.txt"); + fileMetadata.readMetadata(Files.newInputStream(filePath), UTF_8, filePath.toString(), regexpScanner); Set lineRanges = new HashSet<>(); lineRanges.add(new LineRange(21, 25)); @@ -139,7 +149,8 @@ public class IssueExclusionsRegexpScannerTest { @Test public void shouldAddPatternToExcludeLinesWithWrongOrder() throws Exception { - fileMetadata.readMetadata(getResource("file-with-double-regexp-wrong-order.txt").toFile(), UTF_8, regexpScanner); + Path filePath = getResource("file-with-double-regexp-wrong-order.txt"); + fileMetadata.readMetadata(Files.newInputStream(filePath), UTF_8, filePath.toString(), regexpScanner); Set lineRanges = new HashSet<>(); lineRanges.add(new LineRange(25, 35)); @@ -149,7 +160,8 @@ public class IssueExclusionsRegexpScannerTest { @Test public void shouldAddPatternToExcludeLinesWithMess() throws Exception { - fileMetadata.readMetadata(getResource("file-with-double-regexp-mess.txt").toFile(), UTF_8, regexpScanner); + Path filePath = getResource("file-with-double-regexp-mess.txt"); + fileMetadata.readMetadata(Files.newInputStream(filePath), UTF_8, filePath.toString(), regexpScanner); Set lineRanges = new HashSet<>(); lineRanges.add(new LineRange(21, 29)); diff --git a/sonar-scanner-engine/src/test/java/org/sonar/scanner/issue/tracking/SourceHashHolderTest.java b/sonar-scanner-engine/src/test/java/org/sonar/scanner/issue/tracking/SourceHashHolderTest.java index f2f783a6bb3..b0465a5ce40 100644 --- a/sonar-scanner-engine/src/test/java/org/sonar/scanner/issue/tracking/SourceHashHolderTest.java +++ b/sonar-scanner-engine/src/test/java/org/sonar/scanner/issue/tracking/SourceHashHolderTest.java @@ -21,6 +21,8 @@ package org.sonar.scanner.issue.tracking; import java.io.File; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; + import org.apache.commons.io.FileUtils; import org.junit.Before; import org.junit.Rule; @@ -59,6 +61,7 @@ public class SourceHashHolderTest { ioFile = temp.newFile(); when(file.file()).thenReturn(ioFile); when(file.path()).thenReturn(ioFile.toPath()); + when(file.inputStream()).thenAnswer(i -> Files.newInputStream(ioFile.toPath())); when(file.lines()).thenReturn(1); when(file.charset()).thenReturn(StandardCharsets.UTF_8); diff --git a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetDetectorTest.java b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetDetectorTest.java new file mode 100644 index 00000000000..167e362201c --- /dev/null +++ b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetDetectorTest.java @@ -0,0 +1,95 @@ +/* + * SonarQube + * Copyright (C) 2009-2017 SonarSource SA + * mailto:info AT sonarsource DOT com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ +package org.sonar.scanner.scan.filesystem; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Random; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; +import org.junit.rules.TemporaryFolder; + +public class CharsetDetectorTest { + @Rule + public TemporaryFolder temp = new TemporaryFolder(); + + @Rule + public ExpectedException exception = ExpectedException.none(); + + @Test + public void should_detect_charset_from_BOM() { + Path basedir = Paths.get("src/test/resources/org/sonar/scanner/scan/filesystem/"); + + assertThat(detectCharset(basedir.resolve("without_BOM.txt"), StandardCharsets.US_ASCII)).isEqualTo(StandardCharsets.US_ASCII); + assertThat(detectCharset(basedir.resolve("UTF-8.txt"), StandardCharsets.US_ASCII)).isEqualTo(StandardCharsets.UTF_8); + assertThat(detectCharset(basedir.resolve("UTF-16BE.txt"), StandardCharsets.US_ASCII)).isEqualTo(StandardCharsets.UTF_16BE); + assertThat(detectCharset(basedir.resolve("UTF-16LE.txt"), StandardCharsets.US_ASCII)).isEqualTo(StandardCharsets.UTF_16LE); + assertThat(detectCharset(basedir.resolve("UTF-32BE.txt"), StandardCharsets.US_ASCII)).isEqualTo(MetadataGenerator.UTF_32BE); + assertThat(detectCharset(basedir.resolve("UTF-32LE.txt"), StandardCharsets.US_ASCII)).isEqualTo(MetadataGenerator.UTF_32LE); + } + + @Test + public void always_try_utf8() throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + try (OutputStreamWriter writer = new OutputStreamWriter(out, "UTF-8")) { + // UTF-16 can't read 1 byte only + writer.write("t"); + } + + Path filePath = temp.newFile().toPath(); + Files.write(filePath, out.toByteArray()); + assertThat(detectCharset(filePath, StandardCharsets.UTF_16)).isEqualByComparingTo(StandardCharsets.UTF_8); + + } + + @Test + public void fail_if_file_doesnt_exist() { + exception.expect(IllegalStateException.class); + exception.expectMessage("Unable to read file " + Paths.get("non_existing").toAbsolutePath()); + detectCharset(Paths.get("non_existing"), StandardCharsets.UTF_8); + } + + @Test + public void no_encoding_found() throws IOException { + Path filePath = temp.newFile().toPath(); + byte[] b = new byte[512]; + new Random().nextBytes(b); + Files.write(filePath, b); + + CharsetDetector detector = new CharsetDetector(filePath, StandardCharsets.UTF_8); + assertThat(detector.run()).isFalse(); + } + + private Charset detectCharset(Path file, Charset defaultEncoding) { + CharsetDetector detector = new CharsetDetector(file, defaultEncoding); + assertThat(detector.run()).isTrue(); + return detector.charset(); + } +} diff --git a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java index 67b455a9986..c552cf75610 100644 --- a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java +++ b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java @@ -19,6 +19,7 @@ */ package org.sonar.scanner.scan.filesystem; +import static org.apache.commons.codec.digest.DigestUtils.md5Hex; import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -69,29 +70,25 @@ public class MetadataGeneratorTest { public void should_detect_charset_from_BOM() { Path basedir = Paths.get("src/test/resources/org/sonar/scanner/scan/filesystem/"); - assertThat(createInputFileWithMetadata(generator, basedir, "without_BOM.txt").charset()) + assertThat(createInputFileWithMetadata(basedir.resolve("without_BOM.txt")).charset()) .isEqualTo(StandardCharsets.US_ASCII); - assertThat(createInputFileWithMetadata(generator, basedir, "UTF-8.txt").charset()) + assertThat(createInputFileWithMetadata(basedir.resolve("UTF-8.txt")).charset()) .isEqualTo(StandardCharsets.UTF_8); - assertThat(createInputFileWithMetadata(generator, basedir, "UTF-16BE.txt").charset()) + assertThat(createInputFileWithMetadata(basedir.resolve("UTF-16BE.txt")).charset()) .isEqualTo(StandardCharsets.UTF_16BE); - assertThat(createInputFileWithMetadata(generator, basedir, "UTF-16LE.txt").charset()) + assertThat(createInputFileWithMetadata(basedir.resolve("UTF-16LE.txt")).charset()) .isEqualTo(StandardCharsets.UTF_16LE); - assertThat(createInputFileWithMetadata(generator, basedir, "UTF-32BE.txt").charset()) + assertThat(createInputFileWithMetadata(basedir.resolve("UTF-32BE.txt")).charset()) .isEqualTo(MetadataGenerator.UTF_32BE); - assertThat(createInputFileWithMetadata(generator, basedir, "UTF-32LE.txt").charset()) + assertThat(createInputFileWithMetadata(basedir.resolve("UTF-32LE.txt")).charset()) .isEqualTo(MetadataGenerator.UTF_32LE); + } - try { - createInputFileWithMetadata(generator, basedir, "non_existing"); - Assert.fail(); - } catch (IllegalStateException e) { - assertThat(e.getMessage()).endsWith("Unable to read file " + basedir.resolve("non_existing").toAbsolutePath()); - assertThat(e.getCause()).isInstanceOf(IllegalStateException.class); - } + private DefaultInputFile createInputFileWithMetadata(Path filePath) { + return createInputFileWithMetadata(filePath.getParent(), filePath.getFileName().toString()); } - private DefaultInputFile createInputFileWithMetadata(MetadataGenerator generator, Path baseDir, String relativePath) { + private DefaultInputFile createInputFileWithMetadata(Path baseDir, String relativePath) { DefaultInputFile inputFile = new TestInputFileBuilder("struts", relativePath) .setModuleBaseDir(baseDir) .build(); @@ -99,6 +96,29 @@ public class MetadataGeneratorTest { return inputFile; } + @Test + public void start_with_bom() throws Exception { + Path tempFile = temp.newFile().toPath(); + FileUtils.write(tempFile.toFile(), "\uFEFFfoo\nbar\r\nbaz", StandardCharsets.UTF_8, true); + + DefaultInputFile inputFile = createInputFileWithMetadata(tempFile); + assertThat(inputFile.lines()).isEqualTo(3); + assertThat(inputFile.nonBlankLines()).isEqualTo(3); + assertThat(inputFile.hash()).isEqualTo(md5Hex("foo\nbar\nbaz")); + assertThat(inputFile.originalLineOffsets()).containsOnly(0, 4, 9); + } + + @Test + public void non_existing_file_should_throw_exception() { + try { + createInputFileWithMetadata(Paths.get(""), "non_existing"); + Assert.fail(); + } catch (IllegalStateException e) { + assertThat(e.getMessage()).endsWith("Unable to read file " + Paths.get("").resolve("non_existing").toAbsolutePath()); + assertThat(e.getCause()).isInstanceOf(IllegalStateException.class); + } + } + @Test public void complete_input_file() throws Exception { // file system @@ -111,7 +131,7 @@ public class MetadataGeneratorTest { when(statusDetection.status("foo", "src/main/java/foo/Bar.java", "6c1d64c0b3555892fe7273e954f6fb5a")) .thenReturn(InputFile.Status.ADDED); - InputFile inputFile = createInputFileWithMetadata(generator, baseDir, "src/main/java/foo/Bar.java"); + InputFile inputFile = createInputFileWithMetadata(baseDir, "src/main/java/foo/Bar.java"); assertThat(inputFile.type()).isEqualTo(InputFile.Type.MAIN); assertThat(inputFile.file()).isEqualTo(srcFile.toFile()); diff --git a/sonar-scanner-engine/src/test/resources/org/sonar/scanner/issue/ignore/scanner/IssueExclusionsRegexpScannerTest/file-with-single-regexp-last-line.txt b/sonar-scanner-engine/src/test/resources/org/sonar/scanner/issue/ignore/scanner/IssueExclusionsRegexpScannerTest/file-with-single-regexp-last-line.txt index ef135ebc50c..88ad675955f 100644 --- a/sonar-scanner-engine/src/test/resources/org/sonar/scanner/issue/ignore/scanner/IssueExclusionsRegexpScannerTest/file-with-single-regexp-last-line.txt +++ b/sonar-scanner-engine/src/test/resources/org/sonar/scanner/issue/ignore/scanner/IssueExclusionsRegexpScannerTest/file-with-single-regexp-last-line.txt @@ -4,9 +4,6 @@ import com.google.common.collect.Sets; import java.util.Set; -/** - * @SONAR-IGNORE-ALL - */ public class LineRange { int from, to; @@ -30,4 +27,5 @@ public class LineRange { return lines; } -} \ No newline at end of file +} +// @SONAR-IGNORE-ALL \ No newline at end of file diff --git a/sonar-scanner-engine/src/test/resources/org/sonar/scanner/issue/ignore/scanner/IssueExclusionsRegexpScannerTest/file-with-single-regexp.txt b/sonar-scanner-engine/src/test/resources/org/sonar/scanner/issue/ignore/scanner/IssueExclusionsRegexpScannerTest/file-with-single-regexp.txt index 88ad675955f..ea1e7b07e2d 100644 --- a/sonar-scanner-engine/src/test/resources/org/sonar/scanner/issue/ignore/scanner/IssueExclusionsRegexpScannerTest/file-with-single-regexp.txt +++ b/sonar-scanner-engine/src/test/resources/org/sonar/scanner/issue/ignore/scanner/IssueExclusionsRegexpScannerTest/file-with-single-regexp.txt @@ -4,6 +4,9 @@ import com.google.common.collect.Sets; import java.util.Set; +/** + * @SONAR-IGNORE-ALL + */ public class LineRange { int from, to; @@ -28,4 +31,3 @@ public class LineRange { } } -// @SONAR-IGNORE-ALL \ No newline at end of file -- 2.39.5