From: Janos Gyerik <janos.gyerik@sonarsource.com>
Date: Thu, 8 Jun 2017 06:27:12 +0000 (+0200)
Subject: Detect Windows-1252 encoding (#2156)
X-Git-Tag: 6.5-M1~105
X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=6aac427873da6027ffb175322672b11ef13db999;p=sonarqube.git

Detect Windows-1252 encoding (#2156)
---

diff --git a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetector.java b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetector.java
index e0d7fecf320..6295e44ec62 100644
--- a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetector.java
+++ b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetector.java
@@ -38,8 +38,8 @@ public class ByteCharsetDetector {
   private static final ByteOrderMark[] boms = {ByteOrderMark.UTF_8, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE,
     ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE};
 
-  private Charset userConfiguration;
-  private CharsetValidation validator;
+  private final Charset userConfiguration;
+  private final CharsetValidation validator;
 
   public ByteCharsetDetector(CharsetValidation validator, Charset userConfiguration) {
     this.validator = validator;
@@ -69,6 +69,11 @@ public class ByteCharsetDetector {
       return c;
     }
 
+    Result windows1252 = validator.isValidWindows1252(buf);
+    if (windows1252.valid() == Validation.MAYBE) {
+      return windows1252.charset();
+    }
+
     return null;
   }
 
diff --git a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetValidation.java b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetValidation.java
index 5125a801a4e..05dd6b6acff 100644
--- a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetValidation.java
+++ b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetValidation.java
@@ -25,7 +25,8 @@ import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
 import java.nio.charset.StandardCharsets;
-
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.Arrays;
 import javax.annotation.CheckForNull;
 import javax.annotation.Nullable;
 
@@ -34,19 +35,30 @@ public class CharsetValidation {
   private static final double UTF_16_NULL_PASS_THRESHOLD = 0.7;
   private static final double UTF_16_NULL_FAIL_THRESHOLD = 0.1;
 
+  private static final boolean[] VALID_WINDOWS_1252 = new boolean[256];
+  static {
+    Arrays.fill(VALID_WINDOWS_1252, true);
+    // Index is the signed byte value + 128 (for bytes 0x80-0xFF that equals the unsigned value - 128). Entries cleared below are the Undefined cells in the charset table on https://en.wikipedia.org/wiki/Windows-1252
+    VALID_WINDOWS_1252[129 - 128] = false; // 0x81
+    VALID_WINDOWS_1252[141 - 128] = false; // 0x8D
+    VALID_WINDOWS_1252[143 - 128] = false; // 0x8F
+    VALID_WINDOWS_1252[144 - 128] = false; // 0x90
+    VALID_WINDOWS_1252[157 - 128] = false; // 0x9D
+  }
+
   /**
-   * Checks if an array of bytes looks UTF-16 encoded. 
+   * Checks if an array of bytes looks UTF-16 encoded.
    * We look for clues by checking the presence of nulls and new line control chars in both little and big endian byte orders.
    * Failing on nulls will greatly reduce FPs if the buffer is actually encoded in UTF-32.
-   * 
-   * Note that for any unicode between 0-255, UTF-16 encodes it directly in 2 bytes, being the first 0 (null). Since ASCII, ANSI and control chars are 
+   *
+   * Note that for any unicode between 0-255, UTF-16 encodes it directly in 2 bytes, being the first 0 (null). Since ASCII, ANSI and control chars are
    * within this range, we look for number of nulls and see if it is above a certain threshold.
-   * It's possible to have valid chars that map to the opposite (non-null followed by a null) even though it is very unlike. 
-   * That will happen, for example, for any unicode 0x??00, being ?? between 00 and D7. For this reason, we give a small maximum tolerance 
+   * It's possible to have valid chars that map to the opposite (non-null followed by a null) even though it is very unlikely.
+   * That will happen, for example, for any unicode 0x??00, being ?? between 00 and D7. For this reason, we give a small maximum tolerance
    * for opposite nulls (10%).
-   * 
+   *
    * Line feed code point (0x000A) reversed would be (0x0A00). This code point is reserved and should never be found.
-   * 
+   *
    */
   public Result isUTF16(byte[] buffer, boolean failOnNull) {
     if (buffer.length < 2) {
@@ -115,26 +127,26 @@ public class CharsetValidation {
   }
 
   /**
-   * Checks whether it's a valid UTF-16-encoded buffer. 
+   * Checks whether it's a valid UTF-16-encoded buffer.
    * Most sequences of bytes of any encoding will be valid UTF-16, so this is not very effective and gives
    * often false positives.
-   * 
+   *
    * Possible 16bit values in UTF-16:
-   * 
+   *
    * 0x0000-0xD7FF: single 16bit block
    * 0xD800-0xDBFF: first block
    * 0xDC00-0xDFFF: second block
    * 0XE000-0xFFFF: single 16 bit block
-   * 
+   *
    * The following UTF code points get mapped into 1 or 2 blocks:
    * 0x0000 -0xD7FF   (0    -55295)  : 2 bytes, direct mapping
    * 0xE000 -0xFFFF   (57344-65535)  : 2 bytes, direct mapping
    * 0x10000-0x10FFFF (65536-1114111): 2 blocks of 2 bytes (not direct..)
-   * 
+   *
    * Note that Unicode 55296-57345 (0xD800 to 0xDFFF) are not used, since it's reserved and used in UTF-16 for the high/low surrogates.
-   * 
+   *
    * We reject 2-byte blocks with 0 (we consider it's binary) even though it's a valid UTF-16 encoding.
-   * 
+   *
    */
   public boolean isValidUTF16(byte[] buffer) {
     return isValidUTF16(buffer, false);
@@ -169,18 +181,18 @@ public class CharsetValidation {
   }
 
   /**
-   * Checks if a buffer contains only valid UTF8 encoded bytes. 
+   * Checks if a buffer contains only valid UTF8 encoded bytes.
    * It's very effective, giving a clear YES/NO, unless it's ASCII  (unicode < 127), in which case it returns MAYBE.
-   * 
-   * 
+   *
+   *
    * First byte:
    * 0xxxxxxx: only one byte (0-127)
    * 110xxxxx: 2 bytes       (194-223, as 192/193 are invalid)
    * 1110xxxx: 3 bytes       (224-239)
    * 11110xxx: 4 bytes       (240-244)
-   * 
+   *
    * Bytes 2,3 and 4 are always 10xxxxxx (0x80-0xBF or 128-191).
-   * 
+   *
    * So depending on the number of significant bits in the unicode code point, the length will be 1,2,3 or 4 bytes:
    * 0 -7 bits  (0x0000-007F):  1 byte encoding
    * 8 -11 bits (0x0080-07FF): 2 bytes encoding
@@ -254,6 +266,27 @@ public class CharsetValidation {
       : ((buffer[i / 2] & 0xff) << 8) | (buffer[i / 2 + 1] & 0xff);
   }
 
+  /**
+   * Checks that the buffer contains none of the byte values flagged as undefined for Windows-1252.
+   *
+   * @param buf byte buffer to validate
+   * @return Result with Validation.MAYBE and the Windows-1252 charset if no undefined byte is found (including an empty buffer);
+   * Result.INVALID otherwise, or if this JVM does not support the Windows-1252 charset
+   */
+  public Result isValidWindows1252(byte[] buf) {
+    for (byte b : buf) {
+      if (!VALID_WINDOWS_1252[b + 128]) { // b is signed (-128..127); adding 128 shifts it into the table's 0..255 index range
+        return Result.INVALID;
+      }
+    }
+
+    try {
+      return new Result(Validation.MAYBE, Charset.forName("Windows-1252"));
+    } catch (UnsupportedCharsetException e) { // charset not available in this JVM: treat the buffer as not detectable
+      return Result.INVALID;
+    }
+  }
+
   public enum Validation {
     NO,
     YES,
diff --git a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetectorTest.java b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetectorTest.java
index c1da6f08a24..ff3f3feb711 100644
--- a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetectorTest.java
+++ b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetectorTest.java
@@ -28,6 +28,7 @@ import static org.mockito.Mockito.when;
 
 import java.io.IOException;
 import java.net.URISyntaxException;
+import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -85,6 +86,7 @@ public class ByteCharsetDetectorTest {
   public void failAll() {
     when(validation.isUTF8(any(byte[].class), anyBoolean())).thenReturn(Result.INVALID);
     when(validation.isUTF16(any(byte[].class), anyBoolean())).thenReturn(new Result(Validation.MAYBE, null));
+    when(validation.isValidWindows1252(any(byte[].class))).thenReturn(Result.INVALID);
 
     assertThat(charsets.detect(new byte[1])).isEqualTo(null);
   }
@@ -132,4 +134,10 @@ public class ByteCharsetDetectorTest {
     byte[] b3 = new byte[0];
     assertThat(charsets.detectBOM(b3)).isNull();
   }
+
+  @Test
+  public void windows1252() throws IOException, URISyntaxException {
+    ByteCharsetDetector detector = new ByteCharsetDetector(new CharsetValidation(), StandardCharsets.UTF_8); // real validator (not the mock); second argument is the user-configured charset
+    assertThat(detector.detect(readFile("windows-1252"))).isEqualTo(Charset.forName("Windows-1252")); // NOTE(review): presumably readFile loads the windows-1252.txt test resource; helper not visible here, confirm
+  }
 }
diff --git a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetValidationTest.java b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetValidationTest.java
index 365fffb624b..269bb6eef3a 100644
--- a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetValidationTest.java
+++ b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetValidationTest.java
@@ -149,6 +149,17 @@ public class CharsetValidationTest {
     assertThat(charsets.isUTF8(b3, true).valid()).isEqualTo(Validation.NO);
   }
 
+  @Test
+  public void windows_1252() {
+    assertThat(charsets.isValidWindows1252(new byte[]{(byte) 129}).valid()).isEqualTo(Validation.NO); // 0x81
+    assertThat(charsets.isValidWindows1252(new byte[]{(byte) 141}).valid()).isEqualTo(Validation.NO); // 0x8D
+    assertThat(charsets.isValidWindows1252(new byte[]{(byte) 143}).valid()).isEqualTo(Validation.NO); // 0x8F
+    assertThat(charsets.isValidWindows1252(new byte[]{(byte) 144}).valid()).isEqualTo(Validation.NO); // 0x90
+    assertThat(charsets.isValidWindows1252(new byte[]{(byte) 157}).valid()).isEqualTo(Validation.NO); // 0x9D
+    assertThat(charsets.isValidWindows1252(new byte[]{(byte) 189}).valid()).isEqualTo(Validation.MAYBE); // 0xBD is not among the undefined values
+    assertThat(charsets.isUTF8(new byte[]{(byte) 189}, true).valid()).isEqualTo(Validation.NO); // the same lone byte is rejected by the UTF-8 check
+  }
+
   @Test
   public void dontFailIfNotEnoughBytes() {
     byte[] b1 = hexToByte("D800");
diff --git a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java
index 697450bd782..ae73b76db13 100644
--- a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java
+++ b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java
@@ -112,7 +112,8 @@ public class MetadataGeneratorTest {
   @Test
   public void use_default_charset_if_detection_fails() throws IOException {
     Path tempFile = temp.newFile().toPath();
-    byte[] b = {(byte) 0xDF, (byte) 0xFF, (byte) 0xFF};
+    byte invalidWindows1252 = (byte) 129;
+    byte[] b = {(byte) 0xDF, (byte) 0xFF, (byte) 0xFF, invalidWindows1252};
     FileUtils.writeByteArrayToFile(tempFile.toFile(), b);
     DefaultInputFile inputFile = createInputFileWithMetadata(tempFile);
     assertThat(inputFile.charset()).isEqualTo(StandardCharsets.US_ASCII);
diff --git a/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/windows-1252.txt b/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/windows-1252.txt
new file mode 100644
index 00000000000..ed1484ada21
--- /dev/null
+++ b/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/windows-1252.txt
@@ -0,0 +1,30 @@
+using System.Collections.Generic;
+
+namespace SonarQube.Encoding.Tests
+{
+  public enum Western_European_aka_Windows_1252_no_encoding_detected_by_Roslyn
+  {
+  }
+
+  public class KeySuggestion
+  {
+
+  Dictionary<string, string> specialCharSequences = new Dictionary<string, string>
+                                                        {
+                                                          { "_dots_", "..." },
+                                                          { "_colon_", ":" },
+                                                          { "_slash_", "/" },
+                                                          { "_backslash_", "\\" },
+                                                          { "_percentage_", "%" },
+                                                          { "_dash_", "-" },
+                                                          { "_half_", "½" },
+                                                          { "_plus_", "+" },
+                                                          { "_amp_", "&" },
+                                                          { "_pow_", "^" },
+                                                          { "_currency_", "¤" },
+                                                          { "_degrees_", "°" },
+                                                          { "_equals_", "=" },
+                                                          { "_hash_", "#" }
+                                                        };
+  }
+}