source.dussan.org Git - sonarqube.git/commitdiff
SONAR-6337 Detect encoding of a file from BOM
author Evgeny Mandrikov <Godin@users.noreply.github.com>
Fri, 21 Oct 2016 11:54:45 +0000 (13:54 +0200)
committer Julien HENRY <henryju@yahoo.fr>
Fri, 21 Oct 2016 11:54:45 +0000 (13:54 +0200)
sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/InputFileBuilder.java
sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/InputFileBuilderTest.java
sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-16BE.txt [new file with mode: 0644]
sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-16LE.txt [new file with mode: 0644]
sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-32BE.txt [new file with mode: 0644]
sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-32LE.txt [new file with mode: 0644]
sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-8.txt [new file with mode: 0644]
sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/without_BOM.txt [new file with mode: 0644]

index dd85cecf8871f35d9e2b12d8f9406cc9f930531e..511a01ed64484714b1b169c409167a14b7bcf541 100644 (file)
@@ -19,6 +19,7 @@
  */
 package org.sonar.scanner.scan.filesystem;
 
+import com.google.common.annotations.VisibleForTesting;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.sonar.api.CoreProperties;
@@ -32,6 +33,10 @@ import org.sonar.api.scan.filesystem.PathResolver;
 import javax.annotation.CheckForNull;
 
 import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 
 class InputFileBuilder {
 
@@ -93,7 +98,6 @@ class InputFileBuilder {
   DefaultInputFile completeAndComputeMetadata(DefaultInputFile inputFile, InputFile.Type type) {
     inputFile.setType(type);
     inputFile.setModuleBaseDir(fs.baseDir().toPath());
-    inputFile.setCharset(fs.encoding());
 
     String lang = langDetection.language(inputFile);
     if (lang == null && !settings.getBoolean(CoreProperties.IMPORT_UNKNOWN_FILES_KEY)) {
@@ -101,11 +105,46 @@ class InputFileBuilder {
     }
     inputFile.setLanguage(lang);
 
-    inputFile.initMetadata(fileMetadata.readMetadata(inputFile.file(), fs.encoding()));
+    Charset charset = detectCharset(inputFile.file(), fs.encoding());
+    inputFile.setCharset(charset);
+
+    inputFile.initMetadata(fileMetadata.readMetadata(inputFile.file(), charset));
 
     inputFile.setStatus(statusDetection.status(inputFile.moduleKey(), inputFile.relativePath(), inputFile.hash()));
 
     return inputFile;
   }
 
+  /**
+   * Picks the charset to use for a file by looking for a byte order mark (BOM) in its first bytes.
+   * <p>
+   * Up to four leading bytes are read and compared against the BOMs of UTF-8 (EF BB BF),
+   * UTF-32BE (00 00 FE FF), UTF-32LE (FF FE 00 00), UTF-16BE (FE FF) and UTF-16LE (FF FE).
+   * The UTF-32 checks come before the UTF-16 ones because the UTF-32LE BOM starts with the
+   * same two bytes as the UTF-16LE BOM; testing UTF-16LE first would shadow UTF-32LE.
+   * Each check also requires that enough bytes were actually read, so a file shorter than
+   * a given BOM (including an empty file) never matches it and falls through to defaultCharset.
+   * <p>
+   * NOTE(review): the prefix is obtained with a single {@code read} call, which is allowed to
+   * return fewer bytes than requested; confirm this cannot cause a BOM to be missed here.
+   * NOTE(review): a UTF-16LE file whose first character after the BOM is U+0000 has the same
+   * four leading bytes as the UTF-32LE BOM and would be reported as UTF-32LE.
+   *
+   * @param file file whose leading bytes are inspected
+   * @param defaultCharset charset returned when none of the BOMs above is found
+   * @return charset detected from BOM in given file or given defaultCharset
+   * @throws IllegalStateException if an I/O error occurs, including when the file cannot be opened;
+   *         the original IOException is kept as the cause
+   */
+  private static Charset detectCharset(File file, Charset defaultCharset) {
+    try (FileInputStream inputStream = new FileInputStream(file)) {
+      byte[] bom = new byte[4];
+      int n = inputStream.read(bom, 0, bom.length);
+      if ((n >= 3) && (bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) {
+        return StandardCharsets.UTF_8;
+      } else if ((n >= 4) && (bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) && (bom[3] == (byte) 0xFF)) {
+        return UTF_32BE;
+      } else if ((n >= 4) && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) && (bom[3] == (byte) 0x00)) {
+        return UTF_32LE;
+      } else if ((n >= 2) && (bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
+        return StandardCharsets.UTF_16BE;
+      } else if ((n >= 2) && (bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
+        return StandardCharsets.UTF_16LE;
+      } else {
+        return defaultCharset;
+      }
+    } catch (IOException e) {
+      throw new IllegalStateException("Unable to read file " + file.getAbsolutePath(), e);
+    }
+  }
+
+  @VisibleForTesting
+  static final Charset UTF_32BE = Charset.forName("UTF-32BE"); // looked up by name: StandardCharsets offers no UTF-32 constant; non-private so the test can compare against it
+
+  @VisibleForTesting
+  static final Charset UTF_32LE = Charset.forName("UTF-32LE"); // looked up by name: StandardCharsets offers no UTF-32 constant; non-private so the test can compare against it
+
 }
index 63fe01d8eb5b71b39b192b156dd9f9dbed8b144e..d8b01f1b14e2b215c975511f42806eaf3d26dd86 100644 (file)
 package org.sonar.scanner.scan.filesystem;
 
 import java.io.File;
+import java.io.FileNotFoundException;
 import java.nio.charset.StandardCharsets;
 import org.apache.commons.io.FileUtils;
+import org.junit.Assert;
 import org.junit.Rule;
 import org.junit.Test;
 import org.junit.rules.TemporaryFolder;
@@ -46,6 +48,42 @@ public class InputFileBuilderTest {
   StatusDetection statusDetection = mock(StatusDetection.class);
   DefaultModuleFileSystem fs = mock(DefaultModuleFileSystem.class);
 
+  @Test
+  public void should_detect_charset_from_BOM() { // checks that a BOM overrides the module encoding and that an unreadable file fails loudly
+    File basedir = new File("src/test/resources/org/sonar/scanner/scan/filesystem/");
+    when(fs.baseDir()).thenReturn(basedir);
+    when(fs.encoding()).thenReturn(StandardCharsets.US_ASCII); // module default; expected back only for the file without a BOM
+    when(langDetection.language(any(InputFile.class))).thenReturn("java");
+    InputFileBuilder builder = new InputFileBuilder("moduleKey", new PathResolver(), langDetection, statusDetection, fs, new MapSettings(), new FileMetadata());
+
+    assertThat(createAndComplete(builder, new File(basedir, "without_BOM.txt")).charset())
+      .isEqualTo(StandardCharsets.US_ASCII);
+    assertThat(createAndComplete(builder, new File(basedir, "UTF-8.txt")).charset())
+      .isEqualTo(StandardCharsets.UTF_8);
+    assertThat(createAndComplete(builder, new File(basedir, "UTF-16BE.txt")).charset())
+      .isEqualTo(StandardCharsets.UTF_16BE);
+    assertThat(createAndComplete(builder, new File(basedir, "UTF-16LE.txt")).charset())
+      .isEqualTo(StandardCharsets.UTF_16LE);
+    assertThat(createAndComplete(builder, new File(basedir, "UTF-32BE.txt")).charset())
+      .isEqualTo(InputFileBuilder.UTF_32BE);
+    assertThat(createAndComplete(builder, new File(basedir, "UTF-32LE.txt")).charset())
+      .isEqualTo(InputFileBuilder.UTF_32LE);
+
+    try { // a file that does not exist must surface as IllegalStateException wrapping FileNotFoundException
+      createAndComplete(builder, new File(basedir, "non_existing"));
+      Assert.fail(); // reaching this line means no exception was thrown
+    } catch (IllegalStateException e) {
+      assertThat(e.getMessage()).isEqualTo("Unable to read file " + new File(basedir, "non_existing").getAbsolutePath());
+      assertThat(e.getCause()).isInstanceOf(FileNotFoundException.class);
+    }
+  }
+
+  private static DefaultInputFile createAndComplete(InputFileBuilder builder, File file) { // test helper: builds the input file for 'file' and completes it as a MAIN file
+    DefaultInputFile inputFile = builder.create(file);
+    builder.completeAndComputeMetadata(inputFile, InputFile.Type.MAIN); // this step sets the charset that the callers assert on
+    return inputFile;
+  }
+
   @Test
   public void complete_input_file() throws Exception {
     // file system
diff --git a/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-16BE.txt b/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-16BE.txt
new file mode 100644 (file)
index 0000000..c7c42e9
Binary files /dev/null and b/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-16BE.txt differ
diff --git a/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-16LE.txt b/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-16LE.txt
new file mode 100644 (file)
index 0000000..53642b6
Binary files /dev/null and b/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-16LE.txt differ
diff --git a/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-32BE.txt b/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-32BE.txt
new file mode 100644 (file)
index 0000000..c5efe6c
Binary files /dev/null and b/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-32BE.txt differ
diff --git a/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-32LE.txt b/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-32LE.txt
new file mode 100644 (file)
index 0000000..6c34c65
Binary files /dev/null and b/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-32LE.txt differ
diff --git a/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-8.txt b/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-8.txt
new file mode 100644 (file)
index 0000000..ca971be
--- /dev/null
@@ -0,0 +1 @@
+UTF-8
diff --git a/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/without_BOM.txt b/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/without_BOM.txt
new file mode 100644 (file)
index 0000000..9812cbf
--- /dev/null
@@ -0,0 +1 @@
+without BOM