From: Janos Gyerik Date: Thu, 8 Jun 2017 06:27:12 +0000 (+0200) Subject: Detect Windows-1252 encoding (#2156) X-Git-Tag: 6.5-M1~105 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=6aac427873da6027ffb175322672b11ef13db999;p=sonarqube.git Detect Windows-1252 encoding (#2156) --- diff --git a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetector.java b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetector.java index e0d7fecf320..6295e44ec62 100644 --- a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetector.java +++ b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetector.java @@ -38,8 +38,8 @@ public class ByteCharsetDetector { private static final ByteOrderMark[] boms = {ByteOrderMark.UTF_8, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE}; - private Charset userConfiguration; - private CharsetValidation validator; + private final Charset userConfiguration; + private final CharsetValidation validator; public ByteCharsetDetector(CharsetValidation validator, Charset userConfiguration) { this.validator = validator; @@ -69,6 +69,11 @@ public class ByteCharsetDetector { return c; } + Result windows1252 = validator.isValidWindows1252(buf); + if (windows1252.valid() == Validation.MAYBE) { + return windows1252.charset(); + } + return null; } diff --git a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetValidation.java b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetValidation.java index 5125a801a4e..05dd6b6acff 100644 --- a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetValidation.java +++ b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetValidation.java @@ -25,7 +25,8 @@ import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CodingErrorAction; import java.nio.charset.StandardCharsets; - +import java.nio.charset.UnsupportedCharsetException; +import java.util.Arrays; import javax.annotation.CheckForNull; import javax.annotation.Nullable; @@ -34,19 +35,30 @@ public class CharsetValidation { private static final double UTF_16_NULL_PASS_THRESHOLD = 0.7; private static final double UTF_16_NULL_FAIL_THRESHOLD = 0.1; + private static final boolean[] VALID_WINDOWS_1252 = new boolean[256]; + static { + Arrays.fill(VALID_WINDOWS_1252, true); + // See the Undefined cells in the charset table on https://en.wikipedia.org/wiki/Windows-1252 + VALID_WINDOWS_1252[129 - 128] = false; + VALID_WINDOWS_1252[141 - 128] = false; + VALID_WINDOWS_1252[143 - 128] = false; + VALID_WINDOWS_1252[144 - 128] = false; + VALID_WINDOWS_1252[157 - 128] = false; + } + /** - * Checks if an array of bytes looks UTF-16 encoded. + * Checks if an array of bytes looks UTF-16 encoded. * We look for clues by checking the presence of nulls and new line control chars in both little and big endian byte orders. * Failing on nulls will greatly reduce FPs if the buffer is actually encoded in UTF-32. - * - * Note that for any unicode between 0-255, UTF-16 encodes it directly in 2 bytes, being the first 0 (null). Since ASCII, ANSI and control chars are + * + * Note that for any unicode between 0-255, UTF-16 encodes it directly in 2 bytes, being the first 0 (null). Since ASCII, ANSI and control chars are * within this range, we look for number of nulls and see if it is above a certain threshold. - * It's possible to have valid chars that map to the opposite (non-null followed by a null) even though it is very unlike. - * That will happen, for example, for any unicode 0x??00, being ?? between 00 and D7. For this reason, we give a small maximum tolerance + * It's possible to have valid chars that map to the opposite (non-null followed by a null) even though it is very unlike. + * That will happen, for example, for any unicode 0x??00, being ?? between 00 and D7. For this reason, we give a small maximum tolerance * for opposite nulls (10%). - * + * * Line feed code point (0x000A) reversed would be (0x0A00). This code point is reserved and should never be found. - * + * */ public Result isUTF16(byte[] buffer, boolean failOnNull) { if (buffer.length < 2) { @@ -115,26 +127,26 @@ public class CharsetValidation { } /** - * Checks whether it's a valid UTF-16-encoded buffer. + * Checks whether it's a valid UTF-16-encoded buffer. * Most sequences of bytes of any encoding will be valid UTF-16, so this is not very effective and gives * often false positives. - * + * * Possible 16bit values in UTF-16: - * + * * 0x0000-0xD7FF: single 16bit block * 0xD800-0xDBFF: first block * 0xDC00-0xDFFF: second block * 0XE000-0xFFFF: single 16 bit block - * + * * The following UTF code points get mapped into 1 or 2 blocks: * 0x0000 -0xD7FF (0 -55295) : 2 bytes, direct mapping * 0xE000 -0xFFFF (57344-65535) : 2 bytes, direct mapping * 0x10000-0x10FFFF (65536-1114111): 2 blocks of 2 bytes (not direct..) - * + * * Note that Unicode 55296-57345 (0xD800 to 0xDFFF) are not used, since it's reserved and used in UTF-16 for the high/low surrogates. - * + * * We reject 2-byte blocks with 0 (we consider it's binary) even though it's a valid UTF-16 encoding. - * + * */ public boolean isValidUTF16(byte[] buffer) { return isValidUTF16(buffer, false); @@ -169,18 +181,18 @@ public class CharsetValidation { } /** - * Checks if a buffer contains only valid UTF8 encoded bytes. + * Checks if a buffer contains only valid UTF8 encoded bytes. * It's very effective, giving a clear YES/NO, unless it's ASCII (unicode < 127), in which case it returns MAYBE. - * - * + * + * * First byte: * 0xxxxxxx: only one byte (0-127) * 110xxxxx: 2 bytes (194-223, as 192/193 are invalid) * 1110xxxx: 3 bytes (224-239) * 11110xxx: 4 bytes (240-244) - * + * * Bytes 2,3 and 4 are always 10xxxxxx (0x80-0xBF or 128-191). - * + * * So depending on the number of significant bits in the unicode code point, the length will be 1,2,3 or 4 bytes: * 0 -7 bits (0x0000-007F): 1 byte encoding * 8 -11 bits (0x0080-07FF): 2 bytes encoding @@ -254,6 +266,27 @@ public class CharsetValidation { : ((buffer[i / 2] & 0xff) << 8) | (buffer[i / 2 + 1] & 0xff); } + /** + * Verify that the buffer doesn't contain bytes that are not supposed to be used by Windows-1252. + * + * @return Result object with Validation.MAYBE and Windows-1252 if no unknown characters are used, + * otherwise Result.INVALID + * @param buf byte buffer to validate + */ + public Result isValidWindows1252(byte[] buf) { + for (byte b : buf) { + if (!VALID_WINDOWS_1252[b + 128]) { + return Result.INVALID; + } + } + + try { + return new Result(Validation.MAYBE, Charset.forName("Windows-1252")); + } catch (UnsupportedCharsetException e) { + return Result.INVALID; + } + } + public enum Validation { NO, YES, diff --git a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetectorTest.java b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetectorTest.java index c1da6f08a24..ff3f3feb711 100644 --- a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetectorTest.java +++ b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetectorTest.java @@ -28,6 +28,7 @@ import static org.mockito.Mockito.when; import java.io.IOException; import java.net.URISyntaxException; +import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; @@ -85,6 +86,7 @@ public class ByteCharsetDetectorTest { public void failAll() { when(validation.isUTF8(any(byte[].class), anyBoolean())).thenReturn(Result.INVALID); when(validation.isUTF16(any(byte[].class), anyBoolean())).thenReturn(new Result(Validation.MAYBE, null)); + when(validation.isValidWindows1252(any(byte[].class))).thenReturn(Result.INVALID); assertThat(charsets.detect(new byte[1])).isEqualTo(null); } @@ -132,4 +134,10 @@ public class ByteCharsetDetectorTest { byte[] b3 = new byte[0]; assertThat(charsets.detectBOM(b3)).isNull(); } + + @Test + public void windows1252() throws IOException, URISyntaxException { + ByteCharsetDetector detector = new ByteCharsetDetector(new CharsetValidation(), StandardCharsets.UTF_8); + assertThat(detector.detect(readFile("windows-1252"))).isEqualTo(Charset.forName("Windows-1252")); + } } diff --git a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetValidationTest.java b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetValidationTest.java index 365fffb624b..269bb6eef3a 100644 --- a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetValidationTest.java +++ b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetValidationTest.java @@ -149,6 +149,17 @@ public class CharsetValidationTest { assertThat(charsets.isUTF8(b3, true).valid()).isEqualTo(Validation.NO); } + @Test + public void windows_1252() { + assertThat(charsets.isValidWindows1252(new byte[]{(byte) 129}).valid()).isEqualTo(Validation.NO); + assertThat(charsets.isValidWindows1252(new byte[]{(byte) 141}).valid()).isEqualTo(Validation.NO); + assertThat(charsets.isValidWindows1252(new byte[]{(byte) 143}).valid()).isEqualTo(Validation.NO); + assertThat(charsets.isValidWindows1252(new byte[]{(byte) 144}).valid()).isEqualTo(Validation.NO); + assertThat(charsets.isValidWindows1252(new byte[]{(byte) 157}).valid()).isEqualTo(Validation.NO); + assertThat(charsets.isValidWindows1252(new byte[]{(byte) 189}).valid()).isEqualTo(Validation.MAYBE); + assertThat(charsets.isUTF8(new byte[]{(byte) 189}, true).valid()).isEqualTo(Validation.NO); + } + @Test public void dontFailIfNotEnoughBytes() { byte[] b1 = hexToByte("D800"); diff --git a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java index 697450bd782..ae73b76db13 100644 --- a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java +++ b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java @@ -112,7 +112,8 @@ public class MetadataGeneratorTest { @Test public void use_default_charset_if_detection_fails() throws IOException { Path tempFile = temp.newFile().toPath(); - byte[] b = {(byte) 0xDF, (byte) 0xFF, (byte) 0xFF}; + byte invalidWindows1252 = (byte) 129; + byte[] b = {(byte) 0xDF, (byte) 0xFF, (byte) 0xFF, invalidWindows1252}; FileUtils.writeByteArrayToFile(tempFile.toFile(), b); DefaultInputFile inputFile = createInputFileWithMetadata(tempFile); assertThat(inputFile.charset()).isEqualTo(StandardCharsets.US_ASCII); diff --git a/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/windows-1252.txt b/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/windows-1252.txt new file mode 100644 index 00000000000..ed1484ada21 --- /dev/null +++ b/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/windows-1252.txt @@ -0,0 +1,30 @@ +using System.Collections.Generic; + +namespace SonarQube.Encoding.Tests +{ + public enum Western_European_aka_Windows_1252_no_encoding_detected_by_Roslyn + { + } + + public class KeySuggestion + { + + Dictionary specialCharSequences = new Dictionary + { + { "_dots_", "..." }, + { "_colon_", ":" }, + { "_slash_", "/" }, + { "_backslash_", "\\" }, + { "_percentage_", "%" }, + { "_dash_", "-" }, + { "_half_", "½" }, + { "_plus_", "+" }, + { "_amp_", "&" }, + { "_pow_", "^" }, + { "_currency_", "¤" }, + { "_degrees_", "°" }, + { "_equals_", "=" }, + { "_hash_", "#" } + }; + } +}