Detect Windows-1252 encoding (#2156)

author Janos Gyerik <janos.gyerik@sonarsource.com>

Thu, 8 Jun 2017 06:27:12 +0000 (08:27 +0200)

committer GitHub <noreply@github.com>

Thu, 8 Jun 2017 06:27:12 +0000 (08:27 +0200)
author Janos Gyerik <janos.gyerik@sonarsource.com>
Thu, 8 Jun 2017 06:27:12 +0000 (08:27 +0200)
committer GitHub <noreply@github.com>
Thu, 8 Jun 2017 06:27:12 +0000 (08:27 +0200)
diff --git a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetector.java b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetector.java

index e0d7fecf32039950c636ca1a543df7e53fb1fb2f..6295e44ec62b6aa24b2f84f2a5442fcb52d8eac2 100644 (file)
--- a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetector.java
+++ b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetector.java
@@ -38,8 +38,8 @@ public class ByteCharsetDetector {
    private static final ByteOrderMark[] boms = {ByteOrderMark.UTF_8, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE,
      ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE};
  
-  private Charset userConfiguration;
-  private CharsetValidation validator;
+  private final Charset userConfiguration;
+  private final CharsetValidation validator;
  
    public ByteCharsetDetector(CharsetValidation validator, Charset userConfiguration) {
      this.validator = validator;
@@ -69,6 +69,11 @@ public class ByteCharsetDetector {
        return c;
      }
  
+    Result windows1252 = validator.isValidWindows1252(buf);
+    if (windows1252.valid() == Validation.MAYBE) {
+      return windows1252.charset();
+    }
+
      return null;
    }
  
diff --git a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetValidation.java b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetValidation.java

index 5125a801a4e2ff66a41f6c4270457830e7aa7f88..05dd6b6acff0c1e30a56a488eb5dd0a7772bd3cf 100644 (file)
--- a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetValidation.java
+++ b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetValidation.java
@@ -25,7 +25,8 @@ import java.nio.charset.Charset;
  import java.nio.charset.CharsetDecoder;
  import java.nio.charset.CodingErrorAction;
  import java.nio.charset.StandardCharsets;
-
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.Arrays;
  import javax.annotation.CheckForNull;
  import javax.annotation.Nullable;
  
@@ -34,19 +35,30 @@ public class CharsetValidation {
    private static final double UTF_16_NULL_PASS_THRESHOLD = 0.7;
    private static final double UTF_16_NULL_FAIL_THRESHOLD = 0.1;
  
+  // Indexed by (signed byte + 128); false marks the bytes that Windows-1252 leaves undefined.
+  private static final boolean[] VALID_WINDOWS_1252 = new boolean[256];
+  static {
+    Arrays.fill(VALID_WINDOWS_1252, true);
+    // See the Undefined cells in the charset table on https://en.wikipedia.org/wiki/Windows-1252
+    // As a signed byte each value is (unsigned - 256), hence its slot is (unsigned - 128).
+    for (int undefinedByte : new int[] {129, 141, 143, 144, 157}) {
+      VALID_WINDOWS_1252[undefinedByte - 128] = false;
+    }
+  }
+
    /**
-   * Checks if an array of bytes looks UTF-16 encoded. 
+   * Checks if an array of bytes looks UTF-16 encoded.
     * We look for clues by checking the presence of nulls and new line control chars in both little and big endian byte orders.
     * Failing on nulls will greatly reduce FPs if the buffer is actually encoded in UTF-32.
-   * 
-   * Note that for any unicode between 0-255, UTF-16 encodes it directly in 2 bytes, being the first 0 (null). Since ASCII, ANSI and control chars are 
+   *
+   * Note that for any unicode between 0-255, UTF-16 encodes it directly in 2 bytes, being the first 0 (null). Since ASCII, ANSI and control chars are
     * within this range, we look for number of nulls and see if it is above a certain threshold.
-   * It's possible to have valid chars that map to the opposite (non-null followed by a null) even though it is very unlike. 
-   * That will happen, for example, for any unicode 0x??00, being ?? between 00 and D7. For this reason, we give a small maximum tolerance 
+   * It's possible to have valid chars that map to the opposite (non-null followed by a null) even though it is very unlikely.
+   * That will happen, for example, for any unicode 0x??00, being ?? between 00 and D7. For this reason, we give a small maximum tolerance
     * for opposite nulls (10%).
-   * 
+   *
     * Line feed code point (0x000A) reversed would be (0x0A00). This code point is reserved and should never be found.
-   * 
+   *
     */
    public Result isUTF16(byte[] buffer, boolean failOnNull) {
      if (buffer.length < 2) {
@@ -115,26 +127,26 @@ public class CharsetValidation {
    }
  
    /**
-   * Checks whether it's a valid UTF-16-encoded buffer. 
+   * Checks whether it's a valid UTF-16-encoded buffer.
     * Most sequences of bytes of any encoding will be valid UTF-16, so this is not very effective and gives
     * often false positives.
-   * 
+   *
     * Possible 16bit values in UTF-16:
-   * 
+   *
     * 0x0000-0xD7FF: single 16bit block
     * 0xD800-0xDBFF: first block
     * 0xDC00-0xDFFF: second block
     * 0XE000-0xFFFF: single 16 bit block
-   * 
+   *
     * The following UTF code points get mapped into 1 or 2 blocks:
     * 0x0000 -0xD7FF   (0    -55295)  : 2 bytes, direct mapping
     * 0xE000 -0xFFFF   (57344-65535)  : 2 bytes, direct mapping
     * 0x10000-0x10FFFF (65536-1114111): 2 blocks of 2 bytes (not direct..)
-   * 
+   *
     * Note that Unicode 55296-57345 (0xD800 to 0xDFFF) are not used, since it's reserved and used in UTF-16 for the high/low surrogates.
-   * 
+   *
     * We reject 2-byte blocks with 0 (we consider it's binary) even though it's a valid UTF-16 encoding.
-   * 
+   *
     */
    public boolean isValidUTF16(byte[] buffer) {
      return isValidUTF16(buffer, false);
@@ -169,18 +181,18 @@ public class CharsetValidation {
    }
  
    /**
-   * Checks if a buffer contains only valid UTF8 encoded bytes. 
+   * Checks if a buffer contains only valid UTF8 encoded bytes.
     * It's very effective, giving a clear YES/NO, unless it's ASCII  (unicode < 127), in which case it returns MAYBE.
-   * 
-   * 
+   *
+   *
     * First byte:
     * 0xxxxxxx: only one byte (0-127)
     * 110xxxxx: 2 bytes       (194-223, as 192/193 are invalid)
     * 1110xxxx: 3 bytes       (224-239)
     * 11110xxx: 4 bytes       (240-244)
-   * 
+   *
     * Bytes 2,3 and 4 are always 10xxxxxx (0x80-0xBF or 128-191).
-   * 
+   *
     * So depending on the number of significant bits in the unicode code point, the length will be 1,2,3 or 4 bytes:
     * 0 -7 bits  (0x0000-007F):  1 byte encoding
     * 8 -11 bits (0x0080-07FF): 2 bytes encoding
@@ -254,6 +266,27 @@ public class CharsetValidation {
        : ((buffer[i / 2] & 0xff) << 8) | (buffer[i / 2 + 1] & 0xff);
    }
  
+  /**
+   * Scans the buffer for byte values that have no character assigned in Windows-1252.
+   *
+   * @param buf byte buffer to validate
+   * @return Result.INVALID if an unassigned byte is found or the charset is not supported,
+   * otherwise a Result object with Validation.MAYBE and Windows-1252
+   */
+  public Result isValidWindows1252(byte[] buf) {
+    for (int pos = 0; pos < buf.length; pos++) {
+      // bytes are signed: adding 128 shifts [-128, 127] onto the lookup table's [0, 255] index range
+      if (!VALID_WINDOWS_1252[buf[pos] + 128]) {
+        return Result.INVALID;
+      }
+    }
+    try {
+      return new Result(Validation.MAYBE, Charset.forName("Windows-1252"));
+    } catch (UnsupportedCharsetException e) {
+      return Result.INVALID;
+    }
+  }
+
    public enum Validation {
      NO,
      YES,
diff --git a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetectorTest.java b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetectorTest.java

index c1da6f08a242e2755af4eb34087d46ca091dd5ed..ff3f3feb7113caf9080ae78dc3f1a121a0954090 100644 (file)
--- a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetectorTest.java
+++ b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetectorTest.java
@@ -28,6 +28,7 @@ import static org.mockito.Mockito.when;
  
  import java.io.IOException;
  import java.net.URISyntaxException;
+import java.nio.charset.Charset;
  import java.nio.charset.StandardCharsets;
  import java.nio.file.Files;
  import java.nio.file.Path;
@@ -85,6 +86,7 @@ public class ByteCharsetDetectorTest {
    public void failAll() {
      when(validation.isUTF8(any(byte[].class), anyBoolean())).thenReturn(Result.INVALID);
      when(validation.isUTF16(any(byte[].class), anyBoolean())).thenReturn(new Result(Validation.MAYBE, null));
+    when(validation.isValidWindows1252(any(byte[].class))).thenReturn(Result.INVALID);
  
      assertThat(charsets.detect(new byte[1])).isEqualTo(null);
    }
@@ -132,4 +134,10 @@ public class ByteCharsetDetectorTest {
      byte[] b3 = new byte[0];
      assertThat(charsets.detectBOM(b3)).isNull();
    }
+
+  @Test
+  public void windows1252() throws IOException, URISyntaxException {
+    Charset expected = Charset.forName("Windows-1252");
+    assertThat(new ByteCharsetDetector(new CharsetValidation(), StandardCharsets.UTF_8).detect(readFile("windows-1252"))).isEqualTo(expected);
+  }
  }
diff --git a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetValidationTest.java b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetValidationTest.java

index 365fffb624b33c6e17d76363f4b57220e3e2de7b..269bb6eef3ae0366388e71c6a678821e625cddbe 100644 (file)
--- a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetValidationTest.java
+++ b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetValidationTest.java
@@ -149,6 +149,17 @@ public class CharsetValidationTest {
      assertThat(charsets.isUTF8(b3, true).valid()).isEqualTo(Validation.NO);
    }
  
+  @Test
+  public void windows_1252() {
+    // 129, 141, 143, 144 and 157 are the byte values Windows-1252 leaves undefined
+    for (int undefinedByte : new int[] {129, 141, 143, 144, 157}) {
+      assertThat(charsets.isValidWindows1252(new byte[]{(byte) undefinedByte}).valid()).isEqualTo(Validation.NO);
+    }
+    // 189 must pass the Windows-1252 check while a lone 189 byte is rejected by the UTF-8 check
+    assertThat(charsets.isValidWindows1252(new byte[]{(byte) 189}).valid()).isEqualTo(Validation.MAYBE);
+    assertThat(charsets.isUTF8(new byte[]{(byte) 189}, true).valid()).isEqualTo(Validation.NO);
+  }
+
    @Test
    public void dontFailIfNotEnoughBytes() {
      byte[] b1 = hexToByte("D800");
diff --git a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java

index 697450bd782b4d014d261f882164327d28f9c8d4..ae73b76db13a9e2025e770bb660826d5c36c4d5b 100644 (file)
--- a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java
+++ b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java
@@ -112,7 +112,8 @@ public class MetadataGeneratorTest {
    @Test
    public void use_default_charset_if_detection_fails() throws IOException {
      Path tempFile = temp.newFile().toPath();
-    byte[] b = {(byte) 0xDF, (byte) 0xFF, (byte) 0xFF};
+    byte invalidWindows1252 = (byte) 129;
+    byte[] b = {(byte) 0xDF, (byte) 0xFF, (byte) 0xFF, invalidWindows1252};
      FileUtils.writeByteArrayToFile(tempFile.toFile(), b);
      DefaultInputFile inputFile = createInputFileWithMetadata(tempFile);
      assertThat(inputFile.charset()).isEqualTo(StandardCharsets.US_ASCII);
diff --git a/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/windows-1252.txt b/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/windows-1252.txt

new file mode 100644 (file)

index 0000000..ed1484a
--- /dev/null
+++ b/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/windows-1252.txt
@@ -0,0 +1,30 @@
+using System.Collections.Generic;
+
+namespace SonarQube.Encoding.Tests
+{
+  public enum Western_European_aka_Windows_1252_no_encoding_detected_by_Roslyn
+  {
+  }
+
+  public class KeySuggestion
+  {
+
+  Dictionary<string, string> specialCharSequences = new Dictionary<string, string>
+                                                        {
+                                                          { "_dots_", "..." },
+                                                          { "_colon_", ":" },
+                                                          { "_slash_", "/" },
+                                                          { "_backslash_", "\\" },
+                                                          { "_percentage_", "%" },
+                                                          { "_dash_", "-" },
+                                                          { "_half_", "½" },
+                                                          { "_plus_", "+" },
+                                                          { "_amp_", "&" },
+                                                          { "_pow_", "^" },
+                                                          { "_currency_", "¤" },
+                                                          { "_degrees_", "°" },
+                                                          { "_equals_", "=" },
+                                                          { "_hash_", "#" }
+                                                        };
+  }
+}
author	Janos Gyerik <janos.gyerik@sonarsource.com>
	Thu, 8 Jun 2017 06:27:12 +0000 (08:27 +0200)
committer	GitHub <noreply@github.com>
	Thu, 8 Jun 2017 06:27:12 +0000 (08:27 +0200)
sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetector.java		patch \| blob \| history
sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetValidation.java		patch \| blob \| history
sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetectorTest.java		patch \| blob \| history
sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetValidationTest.java		patch \| blob \| history
sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java		patch \| blob \| history
sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/windows-1252.txt	[new file with mode: 0644]	patch \| blob