aboutsummaryrefslogtreecommitdiffstats
path: root/sonar-scanner-engine/src
diff options
context:
space:
mode:
Diffstat (limited to 'sonar-scanner-engine/src')
-rw-r--r--sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetector.java111
-rw-r--r--sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetDetector.java90
-rw-r--r--sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetValidation.java289
-rw-r--r--sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/MetadataGenerator.java13
-rw-r--r--sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetectorTest.java135
-rw-r--r--sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetDetectorTest.java57
-rw-r--r--sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetValidationTest.java183
-rw-r--r--sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java10
-rw-r--r--sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-32LE.txtbin40 -> 36 bytes
9 files changed, 803 insertions, 85 deletions
diff --git a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetector.java b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetector.java
new file mode 100644
index 00000000000..e0d7fecf320
--- /dev/null
+++ b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetector.java
@@ -0,0 +1,111 @@
+/*
+ * SonarQube
+ * Copyright (C) 2009-2017 SonarSource SA
+ * mailto:info AT sonarsource DOT com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+package org.sonar.scanner.scan.filesystem;
+
+import static java.nio.charset.StandardCharsets.UTF_16;
+import static java.nio.charset.StandardCharsets.UTF_16LE;
+import static java.nio.charset.StandardCharsets.UTF_16BE;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.nio.charset.Charset;
+import java.util.Arrays;
+
+import javax.annotation.CheckForNull;
+
+import org.apache.commons.io.ByteOrderMark;
+import org.sonar.scanner.scan.filesystem.CharsetValidation.Result;
+import org.sonar.scanner.scan.filesystem.CharsetValidation.Validation;
+
+public class ByteCharsetDetector {
+ // Order matters: a BOM whose first bytes equal a shorter BOM (UTF-32LE starts with the UTF-16LE mark) must be listed before the shorter one.
+ private static final ByteOrderMark[] boms = {ByteOrderMark.UTF_8, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE,
+ ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE};
+
+  // Charset configured by the user, tried as a fallback by detect(); may be null.
+  private final Charset userConfiguration;
+  // Performs the byte-level checks that detect() relies on.
+  private final CharsetValidation validator;
+
+  /**
+   * @param validator byte-level charset checks used by the detection
+   * @param userConfiguration charset configured by the user; may be null, in which case only the
+   *        UTF-8 and UTF-16 heuristics can produce a result
+   */
+  public ByteCharsetDetector(CharsetValidation validator, Charset userConfiguration) {
+    this.validator = validator;
+    this.userConfiguration = userConfiguration;
+  }
+
+ /**
+ * Guesses the charset of a buffer, without looking at any byte order mark.
+ * The checks run in this order and the first one that accepts the buffer wins:
+ * 1. UTF-8 validation: YES returns the charset reported by the validator; MAYBE hands over to the
+ * user-configured charset via detectAscii.
+ * 2. UTF-16 heuristic, confirmed by a strict UTF-16 validation in the byte order the heuristic reported.
+ * 3. The user-configured charset, if it is not UTF-8, it decodes the buffer, and it is either not a UTF-16
+ * variant or the UTF-16 heuristic was inconclusive (MAYBE).
+ *
+ * @param buf bytes to inspect
+ * @return the detected charset, or null if no check accepted the buffer
+ */
+ @CheckForNull
+ public Charset detect(byte[] buf) {
+ // Try UTF-8 first since we are very confident in it if it's a yes.
+ // Fail if we see nulls, to avoid false positives when the text is ASCII encoded in UTF-16.
+ Result utf8Result = validator.isUTF8(buf, true);
+ if (utf8Result.valid() == Validation.YES) {
+ return utf8Result.charset();
+ } else if (utf8Result.valid() == Validation.MAYBE) {
+ return detectAscii(buf);
+ }
+
+ // Try UTF-16 with both endiannesses. Fail if we see nulls, to avoid false positives when it's UTF-32.
+ Result utf16 = validator.isUTF16(buf, true);
+ if (utf16.valid() == Validation.YES && validator.isValidUTF16(buf, UTF_16LE.equals(utf16.charset()))) {
+ return utf16.charset();
+ }
+
+ // at this point we know it can't be UTF-8
+ Charset c = userConfiguration;
+ if (!UTF_8.equals(c) && (!isUtf16(c) || utf16.valid() == Validation.MAYBE) && validator.tryDecode(buf, c)) {
+ return c;
+ }
+
+ return null;
+ }
+
+  /**
+   * Fallback for buffers the UTF-8 check could not decide on: returns the user-configured charset
+   * when it is not a UTF-16/UTF-32 variant and it decodes the buffer, null otherwise.
+   */
+  private Charset detectAscii(byte[] buf) {
+    boolean userCharsetFits = !isUtf16Or32(userConfiguration) && validator.tryDecode(buf, userConfiguration);
+    return userCharsetFits ? userConfiguration : null;
+  }
+
+  /** Tells whether the charset is UTF-16, UTF-16BE or UTF-16LE; false for null. */
+  private static boolean isUtf16(Charset charset) {
+    if (UTF_16.equals(charset)) {
+      return true;
+    }
+    if (UTF_16BE.equals(charset)) {
+      return true;
+    }
+    return UTF_16LE.equals(charset);
+  }
+
+  /** Tells whether the charset is one of the UTF-16 variants or UTF-32BE/UTF-32LE. */
+  private static boolean isUtf16Or32(Charset charset) {
+    if (isUtf16(charset)) {
+      return true;
+    }
+    if (MetadataGenerator.UTF_32BE.equals(charset)) {
+      return true;
+    }
+    return MetadataGenerator.UTF_32LE.equals(charset);
+  }
+
+  /**
+   * Looks for a known byte order mark at the start of the buffer.
+   * Candidates are tested in the declaration order of the BOM array and the first match wins.
+   *
+   * @param buffer first bytes of the content to inspect
+   * @return the matching BOM, or null if the buffer does not start with any known BOM
+   */
+  @CheckForNull
+  public ByteOrderMark detectBOM(byte[] buffer) {
+    return Arrays.stream(boms)
+      .filter(b -> isBom(b, buffer))
+      // findFirst(), not findAny(): the array order is significant and only findFirst() is guaranteed to honour it
+      .findFirst()
+      .orElse(null);
+  }
+
+  /** Tells whether the buffer starts with all the bytes of the given BOM. */
+  private static boolean isBom(ByteOrderMark bom, byte[] buffer) {
+    int bomLength = bom.length();
+    if (bomLength > buffer.length) {
+      // buffer is too short to hold this BOM
+      return false;
+    }
+    int pos = 0;
+    while (pos < bomLength && (byte) bom.get(pos) == buffer[pos]) {
+      pos++;
+    }
+    return pos == bomLength;
+  }
+
+}
diff --git a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetDetector.java b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetDetector.java
index 3586ea9f171..a2eb629c1a7 100644
--- a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetDetector.java
+++ b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetDetector.java
@@ -22,51 +22,38 @@ package org.sonar.scanner.scan.filesystem;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.nio.ByteBuffer;
-import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
-import java.nio.charset.CharsetDecoder;
-import java.nio.charset.CodingErrorAction;
-import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
-import java.util.LinkedHashSet;
-import java.util.Set;
+import java.util.Arrays;
+
+import javax.annotation.CheckForNull;
import org.apache.commons.io.ByteOrderMark;
import org.apache.commons.io.IOUtils;
-import org.apache.commons.io.input.BOMInputStream;
public class CharsetDetector {
- private static final int BYTES_TO_DECODE = 512;
+ private static final int BYTES_TO_DECODE = 4192;
private Path filePath;
- private BOMInputStream stream;
+ private BufferedInputStream stream;
private Charset detectedCharset;
- private Charset defaultEncoding;
+ private Charset userEncoding;
- public CharsetDetector(Path filePath, Charset defaultEncoding) {
+ public CharsetDetector(Path filePath, Charset userEncoding) {
this.filePath = filePath;
- this.defaultEncoding = defaultEncoding;
+ this.userEncoding = userEncoding;
}
public boolean run() {
try {
- stream = createInputStream(filePath);
- if (detectBOM()) {
- return true;
- }
-
- if (detectCharset()) {
- return true;
- }
-
- detectedCharset = defaultEncoding;
- return false;
+ byte[] buf = readBuffer();
+ return detectCharset(buf);
} catch (IOException e) {
throw new IllegalStateException("Unable to read file " + filePath.toAbsolutePath().toString(), e);
}
}
+ @CheckForNull
public Charset charset() {
assertRun();
return detectedCharset;
@@ -77,52 +64,27 @@ public class CharsetDetector {
return stream;
}
- private static BOMInputStream createInputStream(Path path) throws IOException {
- BufferedInputStream bufferedStream = new BufferedInputStream(Files.newInputStream(path));
- return new BOMInputStream(bufferedStream, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE,
- ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE);
- }
-
- private boolean detectBOM() throws IOException {
- String charsetName = stream.getBOMCharsetName();
- if (charsetName != null) {
- detectedCharset = Charset.forName(charsetName);
- return true;
- }
- return false;
- }
-
- private boolean detectCharset() throws IOException {
+ private byte[] readBuffer() throws IOException {
+ stream = new BufferedInputStream(Files.newInputStream(filePath), BYTES_TO_DECODE * 2);
stream.mark(BYTES_TO_DECODE);
byte[] buf = new byte[BYTES_TO_DECODE];
- int len = IOUtils.read(stream, buf, 0, BYTES_TO_DECODE);
+ int read = IOUtils.read(stream, buf, 0, BYTES_TO_DECODE);
stream.reset();
-
- Set<Charset> charsets = new LinkedHashSet<>();
- charsets.add(defaultEncoding);
- charsets.add(StandardCharsets.UTF_8);
- charsets.add(Charset.defaultCharset());
-
- for (Charset c : charsets) {
- if (tryDecode(buf, len, c)) {
- detectedCharset = c;
- return true;
- }
- }
- return false;
+ stream.mark(-1);
+ return Arrays.copyOf(buf, read);
}
- private static boolean tryDecode(byte[] bytes, int len, Charset charset) throws IOException {
- CharsetDecoder decoder = charset.newDecoder()
- .onMalformedInput(CodingErrorAction.REPORT)
- .onUnmappableCharacter(CodingErrorAction.REPORT);
-
- try {
- decoder.decode(ByteBuffer.wrap(bytes, 0, len));
- } catch (CharacterCodingException e) {
- return false;
+ private boolean detectCharset(byte[] buf) throws IOException {
+ ByteCharsetDetector detector = new ByteCharsetDetector(new CharsetValidation(), userEncoding);
+ ByteOrderMark bom = detector.detectBOM(buf);
+ if (bom != null) {
+ detectedCharset = Charset.forName(bom.getCharsetName());
+ stream.skip(bom.length());
+ return true;
}
- return true;
+
+ detectedCharset = detector.detect(buf);
+ return detectedCharset != null;
}
private void assertRun() {
diff --git a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetValidation.java b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetValidation.java
new file mode 100644
index 00000000000..8dd8f44240d
--- /dev/null
+++ b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetValidation.java
@@ -0,0 +1,289 @@
+/*
+ * SonarQube
+ * Copyright (C) 2009-2017 SonarSource SA
+ * mailto:info AT sonarsource DOT com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+package org.sonar.scanner.scan.filesystem;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.StandardCharsets;
+
+import javax.annotation.CheckForNull;
+import javax.annotation.Nullable;
+
+public class CharsetValidation {
+
+ private static final float UTF_16_NULL_PASS_THRESHOLD = 0.7f;
+ private static final float UTF_16_NULL_FAIL_THRESHOLD = 0.1f;
+
+ /**
+ * Checks if an array of bytes looks UTF-16 encoded.
+ * We look for clues by checking the presence of nulls and line break control chars (0x0A, 0x0D) in both little and big endian byte orders.
+ * Failing on nulls will greatly reduce false positives if the buffer is actually encoded in UTF-32.
+ *
+ * Note that for any code point between 0-255, UTF-16 encodes it directly in 2 bytes, one of them being 0 (null): the first one in big endian,
+ * the second one in little endian. Since ASCII, ANSI and control chars are within this range, we count these pairs and see if their share
+ * is above a certain threshold (70%).
+ * It's possible to have valid chars that map to the opposite (non-null followed by a null) even though it is very unlikely.
+ * That will happen, for example, for any code point 0x??00, ?? being between 00 and D7. For this reason, we give a small maximum tolerance
+ * for opposite nulls (10%).
+ *
+ * Line feed code point (0x000A) reversed would be (0x0A00). This code point is reserved and should never be found.
+ *
+ * @param buffer bytes to inspect; a trailing odd byte is ignored
+ * @param failOnNull if true, a pair of two null bytes makes the result NO
+ * @return NO if the buffer has less than 2 bytes, if a null pair is seen with failOnNull, or if line breaks are seen in both byte orders;
+ * YES with UTF-16BE or UTF-16LE if the clues point to one byte order; MAYBE (without charset) otherwise
+ */
+ public Result isUTF16(byte[] buffer, boolean failOnNull) {
+ if (buffer.length < 2) {
+ return Result.INVALID;
+ }
+
+ // number of (0x00, non-null) pairs, and how many of them are line breaks: clues for big endian
+ int beAscii = 0;
+ int beLines = 0;
+ // number of (non-null, 0x00) pairs, and how many of them are line breaks: clues for little endian
+ int leAscii = 0;
+ int leLines = 0;
+
+ for (int i = 0; i < buffer.length / 2; i++) {
+ // using bytes is fine, since we will compare with positive numbers only
+ byte c1 = buffer[i * 2];
+ byte c2 = buffer[i * 2 + 1];
+
+ if (c1 == 0) {
+ if (c2 != 0) {
+ if (c2 == 0x0a || c2 == 0x0d) {
+ beLines++;
+ }
+ beAscii++;
+ } else if (failOnNull) {
+ // it's probably UTF-32 or binary
+ return Result.INVALID;
+ }
+ } else if (c2 == 0) {
+ leAscii++;
+ if (c1 == 0x0a || c1 == 0x0d) {
+ leLines++;
+ }
+ }
+ }
+
+ // share of the buffer's bytes that belong to a pair matching each byte order
+ float beAsciiPerc = beAscii * 2.0f / (float) buffer.length;
+ float leAsciiPerc = leAscii * 2.0f / (float) buffer.length;
+
+ if (leLines == 0) {
+ // could be BE
+ if (beAsciiPerc >= UTF_16_NULL_PASS_THRESHOLD && leAsciiPerc < UTF_16_NULL_FAIL_THRESHOLD) {
+ return Result.newValid(StandardCharsets.UTF_16BE);
+ }
+ if (beLines > 0) {
+ // this gives false positives for UTF-32 if !failOnNull
+ return Result.newValid(StandardCharsets.UTF_16BE);
+ }
+ } else if (beLines > 0) {
+ // line breaks detected with both endiannesses -> can't be UTF-16
+ return Result.INVALID;
+ }
+ if (beLines == 0) {
+ // could be LE
+ if (leAsciiPerc >= UTF_16_NULL_PASS_THRESHOLD && beAsciiPerc < UTF_16_NULL_FAIL_THRESHOLD) {
+ return Result.newValid(StandardCharsets.UTF_16LE);
+ }
+ if (leLines > 0) {
+ // this gives false positives for UTF-32 if !failOnNull
+ return Result.newValid(StandardCharsets.UTF_16LE);
+ }
+ }
+
+ // If we reach here, no line break was found for a single endianness and we didn't see a strong null pattern for any
+ // endianness.
+ // It could happen if there are no line breaks in the text and it's a language that does not use ANSI (code points > 255).
+ return new Result(Validation.MAYBE, null);
+ }
+
+ /**
+ * Checks whether it's a valid UTF-16-encoded buffer, reading the 16-bit blocks as big endian.
+ * Most sequences of bytes of any encoding will be valid UTF-16, so this is not very effective and often gives
+ * false positives.
+ *
+ * Possible 16-bit values in UTF-16:
+ *
+ * 0x0000-0xD7FF: single 16-bit block
+ * 0xD800-0xDBFF: first block
+ * 0xDC00-0xDFFF: second block
+ * 0xE000-0xFFFF: single 16-bit block
+ *
+ * The following code points get mapped into 1 or 2 blocks:
+ * 0x0000 -0xD7FF (0 -55295) : 2 bytes, direct mapping
+ * 0xE000 -0xFFFF (57344-65535) : 2 bytes, direct mapping
+ * 0x10000-0x10FFFF (65536-1114111): 2 blocks of 2 bytes (not direct..)
+ *
+ * Note that code points 55296-57343 (0xD800 to 0xDFFF) are not used, since they are reserved and used in UTF-16 for the high/low surrogates.
+ *
+ * We reject 2-byte blocks with value 0 (we consider it's binary) even though it's a valid UTF-16 encoding.
+ *
+ * @param buffer bytes to check
+ * @return the result of the two-argument variant called with le = false
+ */
+ public boolean isValidUTF16(byte[] buffer) {
+ return isValidUTF16(buffer, false);
+ }
+
+  /**
+   * Checks that every 16-bit block of the buffer is acceptable UTF-16 in the given byte order.
+   * A block with value 0, a low surrogate without a preceding high surrogate, or a high surrogate
+   * followed by anything but a low surrogate makes the buffer invalid. A high surrogate that is the
+   * last block of the buffer is tolerated.
+   *
+   * @param buffer bytes to check; a trailing odd byte is ignored
+   * @param le true to read the blocks as little endian, false for big endian
+   * @return false if the buffer has less than 2 bytes or contains an unacceptable block
+   */
+  public boolean isValidUTF16(byte[] buffer, boolean le) {
+    if (buffer.length < 2) {
+      return false;
+    }
+    int units = buffer.length / 2;
+    int pos = 0;
+    while (pos < units) {
+      int unit = read16bit(buffer, pos, le);
+      boolean lowSurrogate = unit >= 0xDC00 && unit < 0xE000;
+      if (unit == 0 || lowSurrogate) {
+        return false;
+      }
+      boolean highSurrogate = unit >= 0xD800 && unit < 0xDC00;
+      if (highSurrogate) {
+        // the next block, when present, must be the matching low surrogate
+        pos++;
+        if (pos < units) {
+          int next = read16bit(buffer, pos, le);
+          if (next < 0xDC00 || next >= 0xE000) {
+            return false;
+          }
+        }
+      }
+      // otherwise it is a simple 2 byte encoding (code point in the BMP), and it's valid
+      pos++;
+    }
+    return true;
+  }
+
+  /**
+   * Checks if a buffer contains only valid UTF-8 encoded bytes.
+   * It's very effective, giving a clear YES/NO, unless the buffer is pure ASCII (all bytes in 0-127), in which case it returns MAYBE.
+   *
+   * First byte:
+   * 0xxxxxxx: only one byte (0-127)
+   * 110xxxxx: 2 bytes (194-223, as 192/193 are invalid)
+   * 1110xxxx: 3 bytes (224-239)
+   * 11110xxx: 4 bytes (240-244)
+   *
+   * Bytes 2, 3 and 4 are always 10xxxxxx (0x80-0xBF or 128-191).
+   *
+   * So depending on the number of significant bits in the code point, the length will be 1, 2, 3 or 4 bytes:
+   * 0 -7 bits (0x0000-007F): 1 byte encoding
+   * 8 -11 bits (0x0080-07FF): 2 bytes encoding
+   * 12-16 bits (0x0800-FFFF): 3 bytes encoding
+   * 17-21 bits (0x10000-10FFFF): 4 bytes encoding
+   *
+   * A multi-byte sequence cut off by the end of the buffer is tolerated.
+   *
+   * @param buffer bytes to check
+   * @param rejectNulls if true, any null byte makes the result NO
+   * @return NO on the first invalid byte; MAYBE (with UTF-8) if no continuation byte was read; YES (with UTF-8) otherwise
+   */
+  public Result isUTF8(byte[] buffer, boolean rejectNulls) {
+    boolean onlyAscii = true;
+
+    for (int i = 0; i < buffer.length; i++) {
+      // number of continuation bytes expected after the current byte
+      int len;
+      // make it unsigned for the comparisons
+      int c = 0xFF & buffer[i];
+
+      if (rejectNulls && c == 0) {
+        return Result.INVALID;
+      }
+      if ((c & 0b10000000) == 0) {
+        len = 0;
+      } else if (c >= 194 && c < 224) {
+        len = 1;
+      } else if ((c & 0b11110000) == 0b11100000) {
+        len = 2;
+      } else if (c >= 240 && c <= 244) {
+        // 245-247 also have the 11110xxx shape but would encode code points above 0x10FFFF, so they are rejected below
+        len = 3;
+      } else {
+        return Result.INVALID;
+      }
+
+      while (len > 0) {
+        i++;
+        if (i >= buffer.length) {
+          // sequence truncated by the end of the buffer
+          break;
+        }
+        c = 0xFF & buffer[i];
+        onlyAscii = false;
+
+        // first 2 bits should be 10
+        if ((c & 0b11000000) != 0b10000000) {
+          return Result.INVALID;
+        }
+        len--;
+      }
+    }
+
+    return onlyAscii ? new Result(Validation.MAYBE, StandardCharsets.UTF_8) : Result.newValid(StandardCharsets.UTF_8);
+  }
+
+  /**
+   * Attempts a strict decoding of the byte array with the given charset: malformed input and
+   * unmappable characters are reported as errors instead of being replaced.
+   *
+   * @param bytes bytes to decode
+   * @param charset charset to try; null is never a match
+   * @return true if decoding succeeded, false if the charset is null or there was a decoding error
+   */
+  public boolean tryDecode(byte[] bytes, @Nullable Charset charset) {
+    if (charset == null) {
+      return false;
+    }
+    CharsetDecoder strictDecoder = charset.newDecoder();
+    strictDecoder.onMalformedInput(CodingErrorAction.REPORT);
+    strictDecoder.onUnmappableCharacter(CodingErrorAction.REPORT);
+
+    try {
+      strictDecoder.decode(ByteBuffer.wrap(bytes));
+      return true;
+    } catch (CharacterCodingException e) {
+      return false;
+    }
+  }
+
+  /**
+   * Reads the 16-bit block number {@code i} of the buffer as an unsigned value.
+   *
+   * @param buffer source bytes; must hold at least {@code 2 * i + 2} bytes
+   * @param i index of the 16-bit block (not a byte offset)
+   * @param le true if the first byte of the block is the least significant one (little endian)
+   * @return the block value, between 0 and 0xFFFF
+   */
+  private static int read16bit(byte[] buffer, int i, boolean le) {
+    // callers iterate over block indexes (0 .. length / 2 - 1), so block i occupies bytes 2 * i and 2 * i + 1
+    int first = buffer[2 * i] & 0xff;
+    int second = buffer[2 * i + 1] & 0xff;
+    return le ? (first | (second << 8)) : ((first << 8) | second);
+  }
+
+ /**
+ * Verdict of a charset check: the bytes were rejected (NO), accepted (YES),
+ * or the check was inconclusive (MAYBE).
+ */
+ public enum Validation {
+ NO,
+ YES,
+ MAYBE
+ }
+
+  /**
+   * Immutable outcome of a charset check: a verdict and, when known, the charset it refers to.
+   */
+  public static class Result {
+    /** Shared result for rejected buffers: verdict NO, no charset. */
+    static final Result INVALID = new Result(Validation.NO, null);
+    private final Validation valid;
+    private final Charset charset;
+
+    public Result(Validation valid, @Nullable Charset charset) {
+      this.valid = valid;
+      this.charset = charset;
+    }
+
+    /** Creates a result with verdict YES for the given charset. */
+    public static Result newValid(Charset charset) {
+      return new Result(Validation.YES, charset);
+    }
+
+    public Validation valid() {
+      return valid;
+    }
+
+    /**
+     * Charset the verdict refers to. It is the charset given to {@link #newValid(Charset)} for such results,
+     * null for {@link #INVALID}, and whatever was passed to the constructor otherwise
+     * (a MAYBE result may or may not carry a charset).
+     */
+    @CheckForNull
+    public Charset charset() {
+      return charset;
+    }
+  }
+}
diff --git a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/MetadataGenerator.java b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/MetadataGenerator.java
index f8b796688f7..6c408522a09 100644
--- a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/MetadataGenerator.java
+++ b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/MetadataGenerator.java
@@ -58,11 +58,16 @@ class MetadataGenerator {
* It is an expensive computation, reading the entire file.
*/
public void setMetadata(final DefaultInputFile inputFile, Charset defaultEncoding) {
- CharsetDetector detector = new CharsetDetector(inputFile.path(), defaultEncoding);
+ CharsetDetector charsetDetector = new CharsetDetector(inputFile.path(), defaultEncoding);
try {
- detector.run();
- Charset charset = detector.charset();
- InputStream is = detector.inputStream();
+ Charset charset;
+ if (charsetDetector.run()) {
+ charset = charsetDetector.charset();
+ } else {
+ LOG.debug("Failed to detect a valid charset for file '{}'. Using default charset.", inputFile.relativePath());
+ charset = defaultEncoding;
+ }
+ InputStream is = charsetDetector.inputStream();
inputFile.setCharset(charset);
Metadata metadata = fileMetadata.readMetadata(is, charset, inputFile.absolutePath(), exclusionsScanner.createCharHandlerFor(inputFile.key()));
inputFile.setMetadata(metadata);
diff --git a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetectorTest.java b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetectorTest.java
new file mode 100644
index 00000000000..c1da6f08a24
--- /dev/null
+++ b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetectorTest.java
@@ -0,0 +1,135 @@
+/*
+ * SonarQube
+ * Copyright (C) 2009-2017 SonarSource SA
+ * mailto:info AT sonarsource DOT com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+package org.sonar.scanner.scan.filesystem;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.mockito.Matchers.any;
+import static org.mockito.Matchers.anyBoolean;
+import static org.mockito.Matchers.eq;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+import org.apache.commons.io.ByteOrderMark;
+import org.junit.Before;
+import org.junit.Test;
+import org.sonar.scanner.scan.filesystem.CharsetValidation.Result;
+import org.sonar.scanner.scan.filesystem.CharsetValidation.Validation;
+
+public class ByteCharsetDetectorTest {
+ // mocked validator, so that each test decides what the byte-level checks answer
+ private CharsetValidation validation;
+ // detector under test
+ private ByteCharsetDetector charsets;
+
+ // By default the detector is created without a user-configured charset (null).
+ @Before
+ public void setUp() {
+ validation = mock(CharsetValidation.class);
+ charsets = new ByteCharsetDetector(validation, null);
+ }
+
+  // Each supported BOM is recognized, both from its raw bytes and from the resource file named after its charset.
+  @Test
+  public void detectBOM() throws URISyntaxException, IOException {
+    byte[] rawBom = ByteOrderMark.UTF_16BE.getBytes();
+    assertThat(charsets.detectBOM(rawBom)).isEqualTo(ByteOrderMark.UTF_16BE);
+
+    byte[] utf8 = readFile("UTF-8");
+    byte[] utf16be = readFile("UTF-16BE");
+    byte[] utf16le = readFile("UTF-16LE");
+    byte[] utf32be = readFile("UTF-32BE");
+    byte[] utf32le = readFile("UTF-32LE");
+
+    assertThat(charsets.detectBOM(utf8)).isEqualTo(ByteOrderMark.UTF_8);
+    assertThat(charsets.detectBOM(utf16be)).isEqualTo(ByteOrderMark.UTF_16BE);
+    assertThat(charsets.detectBOM(utf16le)).isEqualTo(ByteOrderMark.UTF_16LE);
+    assertThat(charsets.detectBOM(utf32be)).isEqualTo(ByteOrderMark.UTF_32BE);
+    assertThat(charsets.detectBOM(utf32le)).isEqualTo(ByteOrderMark.UTF_32LE);
+  }
+
+  // Loads the test resource "<fileName>.txt" of this package from the classpath.
+  private byte[] readFile(String fileName) throws URISyntaxException, IOException {
+    String resource = "org/sonar/scanner/scan/filesystem/" + fileName + ".txt";
+    ClassLoader loader = this.getClass().getClassLoader();
+    Path path = Paths.get(loader.getResource(resource).toURI());
+    return Files.readAllBytes(path);
+  }
+
+  // When the UTF-8 check accepts the buffer, its charset is returned right away.
+  @Test
+  public void tryUTF8First() {
+    Result utf8Accepted = Result.newValid(StandardCharsets.UTF_8);
+    when(validation.isUTF8(any(byte[].class), anyBoolean())).thenReturn(utf8Accepted);
+
+    byte[] input = new byte[1];
+    assertThat(charsets.detect(input)).isEqualTo(StandardCharsets.UTF_8);
+  }
+
+  // UTF-8 is rejected, the UTF-16 heuristic accepts and the strict UTF-16 validation confirms it.
+  @Test
+  public void tryUTF16heuristics() {
+    Result utf16Accepted = Result.newValid(StandardCharsets.UTF_16);
+    when(validation.isUTF8(any(byte[].class), anyBoolean())).thenReturn(Result.INVALID);
+    when(validation.isUTF16(any(byte[].class), anyBoolean())).thenReturn(utf16Accepted);
+    when(validation.isValidUTF16(any(byte[].class), anyBoolean())).thenReturn(true);
+
+    byte[] input = new byte[1];
+    assertThat(charsets.detect(input)).isEqualTo(StandardCharsets.UTF_16);
+  }
+
+  // Nothing matches and there is no user-configured charset: detection gives up.
+  @Test
+  public void failAll() {
+    when(validation.isUTF8(any(byte[].class), anyBoolean())).thenReturn(Result.INVALID);
+    when(validation.isUTF16(any(byte[].class), anyBoolean())).thenReturn(new Result(Validation.MAYBE, null));
+
+    assertThat(charsets.detect(new byte[1])).isNull();
+  }
+
+  // An inconclusive UTF-8 check falls back to the user-configured charset only; without one the result is null,
+  // even though the UTF-16 checks would accept the buffer.
+  @Test
+  public void failAnsii() {
+    when(validation.isUTF8(any(byte[].class), anyBoolean())).thenReturn(new Result(Validation.MAYBE, null));
+    when(validation.isUTF16(any(byte[].class), anyBoolean())).thenReturn(Result.newValid(StandardCharsets.UTF_16));
+    when(validation.isValidUTF16(any(byte[].class), anyBoolean())).thenReturn(true);
+
+    assertThat(charsets.detect(new byte[1])).isNull();
+  }
+
+  // An inconclusive UTF-8 check falls back to the user-configured charset when it decodes the buffer.
+  @Test
+  public void tryUserAnsii() {
+    Result utf8Undecided = new Result(Validation.MAYBE, null);
+    Result utf16Accepted = Result.newValid(StandardCharsets.UTF_16);
+    when(validation.isUTF8(any(byte[].class), anyBoolean())).thenReturn(utf8Undecided);
+    when(validation.isUTF16(any(byte[].class), anyBoolean())).thenReturn(utf16Accepted);
+    when(validation.isValidUTF16(any(byte[].class), anyBoolean())).thenReturn(true);
+    when(validation.tryDecode(any(byte[].class), eq(StandardCharsets.ISO_8859_1))).thenReturn(true);
+
+    charsets = new ByteCharsetDetector(validation, StandardCharsets.ISO_8859_1);
+    byte[] input = new byte[1];
+    assertThat(charsets.detect(input)).isEqualTo(StandardCharsets.ISO_8859_1);
+  }
+
+  // UTF-8 is rejected and UTF-16 is undecided: the user-configured charset is used when it decodes the buffer.
+  @Test
+  public void tryOtherUserCharset() {
+    Result utf16Undecided = new Result(Validation.MAYBE, null);
+    when(validation.isUTF8(any(byte[].class), anyBoolean())).thenReturn(Result.INVALID);
+    when(validation.isUTF16(any(byte[].class), anyBoolean())).thenReturn(utf16Undecided);
+    when(validation.tryDecode(any(byte[].class), eq(StandardCharsets.ISO_8859_1))).thenReturn(true);
+
+    charsets = new ByteCharsetDetector(validation, StandardCharsets.ISO_8859_1);
+    byte[] input = new byte[1];
+    assertThat(charsets.detect(input)).isEqualTo(StandardCharsets.ISO_8859_1);
+  }
+
+  // Buffers that do not start with a complete known BOM yield no BOM.
+  @Test
+  public void invalidBOM() {
+    byte[] unknownMark = {(byte) 0xFF, (byte) 0xFF};
+    byte[] notEnoughBytes = {(byte) 0xFE};
+    byte[] empty = new byte[0];
+
+    assertThat(charsets.detectBOM(unknownMark)).isNull();
+    assertThat(charsets.detectBOM(notEnoughBytes)).isNull();
+    assertThat(charsets.detectBOM(empty)).isNull();
+  }
+}
diff --git a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetDetectorTest.java b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetDetectorTest.java
index 5abdb43764a..aeb38ee7993 100644
--- a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetDetectorTest.java
+++ b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetDetectorTest.java
@@ -19,18 +19,25 @@
*/
package org.sonar.scanner.scan.filesystem;
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static java.nio.charset.StandardCharsets.UTF_16;
+import static java.nio.charset.StandardCharsets.UTF_16BE;
+import static java.nio.charset.StandardCharsets.UTF_16LE;
+import static java.nio.charset.StandardCharsets.UTF_8;
import static org.assertj.core.api.Assertions.assertThat;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
-import java.io.OutputStreamWriter;
+import java.io.InputStreamReader;
import java.nio.charset.Charset;
-import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
+import java.util.List;
import java.util.Random;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringUtils;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;
@@ -47,33 +54,42 @@ public class CharsetDetectorTest {
public void should_detect_charset_from_BOM() {
Path basedir = Paths.get("src/test/resources/org/sonar/scanner/scan/filesystem/");
- assertThat(detectCharset(basedir.resolve("without_BOM.txt"), StandardCharsets.US_ASCII)).isEqualTo(StandardCharsets.US_ASCII);
- assertThat(detectCharset(basedir.resolve("UTF-8.txt"), StandardCharsets.US_ASCII)).isEqualTo(StandardCharsets.UTF_8);
- assertThat(detectCharset(basedir.resolve("UTF-16BE.txt"), StandardCharsets.US_ASCII)).isEqualTo(StandardCharsets.UTF_16BE);
- assertThat(detectCharset(basedir.resolve("UTF-16LE.txt"), StandardCharsets.US_ASCII)).isEqualTo(StandardCharsets.UTF_16LE);
- assertThat(detectCharset(basedir.resolve("UTF-32BE.txt"), StandardCharsets.US_ASCII)).isEqualTo(MetadataGenerator.UTF_32BE);
- assertThat(detectCharset(basedir.resolve("UTF-32LE.txt"), StandardCharsets.US_ASCII)).isEqualTo(MetadataGenerator.UTF_32LE);
+ assertThat(detectCharset(basedir.resolve("without_BOM.txt"), US_ASCII)).isEqualTo(US_ASCII);
+ assertThat(detectCharset(basedir.resolve("UTF-8.txt"), US_ASCII)).isEqualTo(UTF_8);
+ assertThat(detectCharset(basedir.resolve("UTF-16BE.txt"), US_ASCII)).isEqualTo(UTF_16BE);
+ assertThat(detectCharset(basedir.resolve("UTF-16LE.txt"), US_ASCII)).isEqualTo(UTF_16LE);
+ assertThat(detectCharset(basedir.resolve("UTF-32BE.txt"), US_ASCII)).isEqualTo(MetadataGenerator.UTF_32BE);
+ assertThat(detectCharset(basedir.resolve("UTF-32LE.txt"), US_ASCII)).isEqualTo(MetadataGenerator.UTF_32LE);
+ }
+
+ @Test
+ public void should_read_files_from_BOM() throws IOException {
+ Path basedir = Paths.get("src/test/resources/org/sonar/scanner/scan/filesystem/");
+ assertThat(readFile(basedir.resolve("without_BOM.txt"), US_ASCII)).isEqualTo("without BOM");
+ assertThat(readFile(basedir.resolve("UTF-8.txt"), US_ASCII)).isEqualTo("UTF-8");
+ assertThat(readFile(basedir.resolve("UTF-16BE.txt"), US_ASCII)).isEqualTo("UTF-16BE");
+ assertThat(readFile(basedir.resolve("UTF-16LE.txt"), US_ASCII)).isEqualTo("UTF-16LE");
+ assertThat(readFile(basedir.resolve("UTF-32BE.txt"), US_ASCII)).isEqualTo("UTF-32BE");
+ assertThat(readFile(basedir.resolve("UTF-32LE.txt"), US_ASCII)).isEqualTo("UTF-32LE");
}
@Test
public void always_try_utf8() throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
- try (OutputStreamWriter writer = new OutputStreamWriter(out, "UTF-8")) {
- // UTF-16 can't read 1 byte only
- writer.write("t");
- }
+ // this is a valid 2 byte UTF-8.
+ out.write(194);
+ out.write(128);
Path filePath = temp.newFile().toPath();
Files.write(filePath, out.toByteArray());
- assertThat(detectCharset(filePath, StandardCharsets.UTF_16)).isEqualByComparingTo(StandardCharsets.UTF_8);
-
+ assertThat(detectCharset(filePath, UTF_16)).isEqualTo(UTF_8);
}
@Test
public void fail_if_file_doesnt_exist() {
exception.expect(IllegalStateException.class);
exception.expectMessage("Unable to read file " + Paths.get("non_existing").toAbsolutePath());
- detectCharset(Paths.get("non_existing"), StandardCharsets.UTF_8);
+ detectCharset(Paths.get("non_existing"), UTF_8);
}
@Test
@@ -83,9 +99,16 @@ public class CharsetDetectorTest {
new Random().nextBytes(b);
Files.write(filePath, b);
- CharsetDetector detector = new CharsetDetector(filePath, StandardCharsets.UTF_8);
+ CharsetDetector detector = new CharsetDetector(filePath, UTF_8);
assertThat(detector.run()).isFalse();
- assertThat(detector.charset()).isEqualTo(StandardCharsets.UTF_8);
+ assertThat(detector.charset()).isNull();
+ }
+
+ private String readFile(Path file, Charset defaultEncoding) throws IOException {
+ CharsetDetector detector = new CharsetDetector(file, defaultEncoding);
+ assertThat(detector.run()).isTrue();
+ List<String> readLines = IOUtils.readLines(new InputStreamReader(detector.inputStream(), detector.charset()));
+ return StringUtils.join(readLines, "\n");
}
private Charset detectCharset(Path file, Charset defaultEncoding) {
diff --git a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetValidationTest.java b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetValidationTest.java
new file mode 100644
index 00000000000..365fffb624b
--- /dev/null
+++ b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetValidationTest.java
@@ -0,0 +1,183 @@
+/*
+ * SonarQube
+ * Copyright (C) 2009-2017 SonarSource SA
+ * mailto:info AT sonarsource DOT com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+package org.sonar.scanner.scan.filesystem;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.commons.lang.StringUtils;
+import org.junit.Before;
+import org.junit.Test;
+import org.sonar.scanner.scan.filesystem.CharsetValidation.Validation;
+
+/**
+ * Unit tests for {@link CharsetValidation}: checks which byte sequences are accepted, rejected or left
+ * undecided ({@link Validation#MAYBE}) when validated as UTF-8 or UTF-16.
+ */
+public class CharsetValidationTest {
+  private CharsetValidation charsets;
+
+  @Before
+  public void setUp() {
+    charsets = new CharsetValidation();
+  }
+
+  @Test
+  public void testWithSourceCode() throws IOException, URISyntaxException {
+    Path path = Paths.get(this.getClass().getClassLoader().getResource("mediumtest/xoo/sample/xources/hello/HelloJava.xoo").toURI());
+    List<String> lines = Files.readAllLines(path, StandardCharsets.UTF_8);
+    // readAllLines strips the line terminators; the lines are concatenated without any separator
+    String text = String.join("", lines);
+
+    byte[] utf8 = encode(text, StandardCharsets.UTF_8);
+    byte[] utf16be = encode(text, StandardCharsets.UTF_16BE);
+    byte[] utf16le = encode(text, StandardCharsets.UTF_16LE);
+
+    assertThat(charsets.isUTF8(utf8, true).charset()).isEqualTo(StandardCharsets.UTF_8);
+    assertThat(charsets.isUTF16(utf16be, true).charset()).isEqualTo(StandardCharsets.UTF_16BE);
+    assertThat(charsets.isUTF16(utf16le, true).charset()).isEqualTo(StandardCharsets.UTF_16LE);
+
+    // NOTE(review): the boolean presumably selects little-endian decoding -- confirm against CharsetValidation
+    assertThat(charsets.isValidUTF16(utf16be, false)).isTrue();
+    assertThat(charsets.isValidUTF16(utf16le, true)).isTrue();
+  }
+
+  @Test
+  public void detectUTF16NewLine() throws CharacterCodingException {
+    // the first char will be encoded with a null on the second byte, but we should still detect it due to the new line
+    String text = "\uA100" + "\uA212" + "\n";
+
+    byte[] utf16be = encode(text, StandardCharsets.UTF_16BE);
+    byte[] utf16le = encode(text, StandardCharsets.UTF_16LE);
+    byte[] utf8 = encode(text, StandardCharsets.UTF_8);
+    byte[] utf32 = encode(text, Charset.forName("UTF-32LE"));
+
+    assertThat(charsets.isUTF16(utf16le, true).charset()).isEqualTo(StandardCharsets.UTF_16LE);
+    assertThat(charsets.isUTF16(utf16be, true).charset()).isEqualTo(StandardCharsets.UTF_16BE);
+    assertThat(charsets.isUTF16(utf8, true).valid()).isEqualTo(Validation.MAYBE);
+    // this will have a double null, so it will be yes or no based on failOnNull
+    assertThat(charsets.isUTF16(utf32, true).valid()).isEqualTo(Validation.NO);
+    assertThat(charsets.isUTF16(utf32, false).valid()).isEqualTo(Validation.YES);
+  }
+
+  @Test
+  public void detectUTF16Ascii() throws CharacterCodingException {
+    String text = "some text to test";
+    byte[] utf16be = encode(text, StandardCharsets.UTF_16BE);
+    byte[] utf16le = encode(text, StandardCharsets.UTF_16LE);
+    byte[] utf8 = encode(text, StandardCharsets.UTF_8);
+    byte[] iso88591 = encode(text, StandardCharsets.ISO_8859_1);
+    byte[] utf32 = encode(text, Charset.forName("UTF-32LE"));
+
+    assertThat(charsets.isUTF16(utf16le, true).charset()).isEqualTo(StandardCharsets.UTF_16LE);
+    assertThat(charsets.isUTF16(utf16be, true).charset()).isEqualTo(StandardCharsets.UTF_16BE);
+    // not enough nulls -> we don't know
+    assertThat(charsets.isUTF16(iso88591, true).valid()).isEqualTo(Validation.MAYBE);
+    assertThat(charsets.isUTF16(utf8, true).valid()).isEqualTo(Validation.MAYBE);
+    // fail based on double nulls
+    assertThat(charsets.isUTF16(utf32, true).valid()).isEqualTo(Validation.NO);
+  }
+
+  @Test
+  public void validUTF8() {
+    // UTF8 with 3 bytes
+    byte[] b = hexToByte("E2 80 A6");
+    assertThat(charsets.isUTF8(b, true).valid()).isEqualTo(Validation.YES);
+  }
+
+  @Test
+  public void invalidUTF16() {
+    // UTF-16 accepts any 2-byte unit as a direct encoding unless it falls in D800-DFFF (high and low surrogates).
+    // A surrogate is half of a 4-byte encoding, so it is not valid when it is not properly paired.
+    byte[] b1 = hexToByte("D800 0000");
+    assertThat(charsets.isValidUTF16(b1)).isFalse();
+
+    // NOTE(review): read as little-endian these bytes are U+0000 U+00D8 rather than a lone surrogate
+    // (that would be "00D8 0000"); the rejection presumably comes from the double null -- confirm intent.
+    byte[] b1le = hexToByte("0000 D800");
+    assertThat(charsets.isValidUTF16(b1le, true)).isFalse();
+
+    // not enough bytes (any byte following this one would make it valid)
+    byte[] b2 = {(byte) 0x01};
+    assertThat(charsets.isValidUTF16(b2)).isFalse();
+
+    // we reject double 0
+    byte[] b3 = {(byte) 0, (byte) 0};
+    assertThat(charsets.isValidUTF16(b3)).isFalse();
+  }
+
+  @Test
+  public void invalidUTF8() {
+    // never expects to see 0xFF or 0xC0..
+    byte[] b1 = {(byte) 0xFF};
+    assertThat(charsets.isUTF8(b1, true).valid()).isEqualTo(Validation.NO);
+
+    byte[] b1c = {(byte) 0xC0};
+    assertThat(charsets.isUTF8(b1c, true).valid()).isEqualTo(Validation.NO);
+
+    // the first byte indicates a 2-byte encoding, but second byte is not valid
+    byte[] b2 = {(byte) 0b11000010, (byte) 0b11000000};
+    assertThat(charsets.isUTF8(b2, true).valid()).isEqualTo(Validation.NO);
+
+    // we reject nulls (mainly to reject UTF-16)
+    byte[] b3 = {(byte) 0};
+    assertThat(charsets.isUTF8(b3, true).valid()).isEqualTo(Validation.NO);
+  }
+
+  @Test
+  public void dontFailIfNotEnoughBytes() {
+    // a high surrogate with nothing after it: truncated input, not invalid input
+    byte[] b1 = hexToByte("D800");
+    assertThat(charsets.isValidUTF16(b1)).isTrue();
+
+    // the first byte indicates a 2-byte encoding, but there is no second byte
+    byte[] b2 = {(byte) 0b11000010};
+    assertThat(charsets.isUTF8(b2, true).valid()).isEqualTo(Validation.MAYBE);
+  }
+
+  /**
+   * Encodes {@code txt} with {@code charset}, reporting malformed or unmappable characters as errors
+   * instead of silently substituting them.
+   *
+   * @throws CharacterCodingException if {@code txt} cannot be represented in {@code charset}
+   */
+  private static byte[] encode(String txt, Charset charset) throws CharacterCodingException {
+    CharsetEncoder encoder = charset.newEncoder()
+      .onMalformedInput(CodingErrorAction.REPORT)
+      .onUnmappableCharacter(CodingErrorAction.REPORT);
+    ByteBuffer encoded = encoder.encode(CharBuffer.wrap(txt));
+    byte[] b = new byte[encoded.remaining()];
+    encoded.get(b);
+    return b;
+  }
+
+  /**
+   * Converts a string of hexadecimal digits into bytes, two digits per byte. Whitespace is ignored so
+   * that digits can be grouped for readability (e.g. {@code "D800 0000"}).
+   *
+   * @throws IllegalArgumentException if the number of digits is odd or a character is not a hex digit
+   */
+  private static byte[] hexToByte(String str) {
+    String s = StringUtils.deleteWhitespace(str);
+    int len = s.length();
+    if (len % 2 != 0) {
+      throw new IllegalArgumentException("Hex string must contain an even number of digits: " + str);
+    }
+    byte[] data = new byte[len / 2];
+    for (int i = 0; i < len; i += 2) {
+      int high = Character.digit(s.charAt(i), 16);
+      int low = Character.digit(s.charAt(i + 1), 16);
+      // Character.digit returns -1 for a non-hex character, which would otherwise corrupt the byte silently
+      if (high < 0 || low < 0) {
+        throw new IllegalArgumentException("Not a hexadecimal string: " + str);
+      }
+      data[i / 2] = (byte) ((high << 4) + low);
+    }
+    return data;
+  }
+
+}
diff --git a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java
index c552cf75610..697450bd782 100644
--- a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java
+++ b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java
@@ -24,6 +24,7 @@ import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
+import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.nio.file.Paths;
@@ -109,6 +110,15 @@ public class MetadataGeneratorTest {
}
@Test
+  public void use_default_charset_if_detection_fails() throws IOException {
+    // 0xDF announces a 2-byte UTF-8 sequence, but 0xFF can never appear in UTF-8
+    byte[] undetectableContent = {(byte) 0xDF, (byte) 0xFF, (byte) 0xFF};
+    Path file = temp.newFile().toPath();
+    FileUtils.writeByteArrayToFile(file.toFile(), undetectableContent);
+    // US_ASCII is presumably the default encoding configured by this test's fixture -- verify against setup
+    assertThat(createInputFileWithMetadata(file).charset()).isEqualTo(StandardCharsets.US_ASCII);
+  }
+
+ @Test
public void non_existing_file_should_throw_exception() {
try {
createInputFileWithMetadata(Paths.get(""), "non_existing");
diff --git a/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-32LE.txt b/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-32LE.txt
index 6c34c65bd49..a89e27e4f25 100644
--- a/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-32LE.txt
+++ b/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-32LE.txt
Binary files differ