aboutsummaryrefslogtreecommitdiffstats
path: root/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetDetectorTest.java
blob: 5cfeb8f1851251270dd2eb9681957e8078ec56d9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
/*
 * SonarQube
 * Copyright (C) 2009-2025 SonarSource SA
 * mailto:info AT sonarsource DOT com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 3 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */
package org.sonar.scanner.scan.filesystem;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.Random;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

import static java.nio.charset.StandardCharsets.US_ASCII;
import static java.nio.charset.StandardCharsets.UTF_16;
import static java.nio.charset.StandardCharsets.UTF_16BE;
import static java.nio.charset.StandardCharsets.UTF_16LE;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;

/**
 * Tests for {@link CharsetDetector}: detection of the charset of files with and without a BOM,
 * the UTF-8 attempt made regardless of the supplied default encoding, the failure raised for a
 * missing file, and the case where no charset can be determined.
 */
public class CharsetDetectorTest {
  /** Directory holding the sample files (one per encoding) used by the BOM tests. */
  private static final Path RESOURCES_DIR = Paths.get("src/test/resources/org/sonar/scanner/scan/filesystem/");

  @Rule
  public TemporaryFolder temp = new TemporaryFolder();

  /**
   * Each sample file must be reported with its own charset, even though US-ASCII is passed as
   * the default encoding; the file without a BOM is reported as US-ASCII.
   */
  @Test
  public void should_detect_charset_from_BOM() {
    assertThat(detectCharset(RESOURCES_DIR.resolve("without_BOM.txt"), US_ASCII)).isEqualTo(US_ASCII);
    assertThat(detectCharset(RESOURCES_DIR.resolve("UTF-8.txt"), US_ASCII)).isEqualTo(UTF_8);
    assertThat(detectCharset(RESOURCES_DIR.resolve("UTF-16BE.txt"), US_ASCII)).isEqualTo(UTF_16BE);
    assertThat(detectCharset(RESOURCES_DIR.resolve("UTF-16LE.txt"), US_ASCII)).isEqualTo(UTF_16LE);
    assertThat(detectCharset(RESOURCES_DIR.resolve("UTF-32BE.txt"), US_ASCII)).isEqualTo(MetadataGenerator.UTF_32BE);
    assertThat(detectCharset(RESOURCES_DIR.resolve("UTF-32LE.txt"), US_ASCII)).isEqualTo(MetadataGenerator.UTF_32LE);
  }

  /**
   * Reading each sample file through the detector's stream with the detected charset must yield
   * exactly the expected text.
   */
  @Test
  public void should_read_files_from_BOM() throws IOException {
    assertThat(readFile(RESOURCES_DIR.resolve("without_BOM.txt"), US_ASCII)).isEqualTo("without BOM");
    assertThat(readFile(RESOURCES_DIR.resolve("UTF-8.txt"), US_ASCII)).isEqualTo("UTF-8");
    assertThat(readFile(RESOURCES_DIR.resolve("UTF-16BE.txt"), US_ASCII)).isEqualTo("UTF-16BE");
    assertThat(readFile(RESOURCES_DIR.resolve("UTF-16LE.txt"), US_ASCII)).isEqualTo("UTF-16LE");
    assertThat(readFile(RESOURCES_DIR.resolve("UTF-32BE.txt"), US_ASCII)).isEqualTo("UTF-32BE");
    assertThat(readFile(RESOURCES_DIR.resolve("UTF-32LE.txt"), US_ASCII)).isEqualTo("UTF-32LE");
  }

  /**
   * A file whose content is a valid UTF-8 sequence must be detected as UTF-8 even when a
   * different default encoding (UTF-16 here) is supplied.
   */
  @Test
  public void always_try_utf8() throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    // this is a valid 2 byte UTF-8.
    out.write(194);
    out.write(128);

    Path filePath = temp.newFile().toPath();
    Files.write(filePath, out.toByteArray());
    assertThat(detectCharset(filePath, UTF_16)).isEqualTo(UTF_8);
  }

  /**
   * Running the detector on a path that does not exist must fail with an
   * {@link IllegalStateException} whose message contains the absolute path.
   */
  @Test
  public void fail_if_file_doesnt_exist() {
    assertThatThrownBy(() -> detectCharset(Paths.get("non_existing"), UTF_8))
      .isInstanceOf(IllegalStateException.class)
      .hasMessage("Unable to read file " + Paths.get("non_existing").toAbsolutePath());
  }

  /**
   * Random binary content, adjusted so that it cannot match any supported encoding, must make
   * {@code run()} return {@code false} and leave {@code charset()} null.
   */
  @Test
  public void no_encoding_found() throws IOException {
    Path filePath = temp.newFile().toPath();
    byte[] b = new byte[4096];
    new Random().nextBytes(b);
    // avoid accidental BOM matching
    b[0] = 1;

    // avoid UTF-8 / UTF-16
    b[100] = 0;
    b[101] = 0;
    b[102] = 0;
    b[103] = 0;

    // invalid in win-1258
    b[200] = (byte) 129;

    Files.write(filePath, b);

    CharsetDetector detector = new CharsetDetector(filePath, UTF_8);
    assertThat(detector.run()).isFalse();
    assertThat(detector.charset()).isNull();
  }

  /**
   * Runs the detector on {@code file}, asserts that detection succeeded, and returns the file
   * content decoded with the detected charset.
   *
   * @param file the file to read
   * @param defaultEncoding the encoding handed to the detector as the default
   * @return the lines of the file joined with {@code "\n"}
   * @throws IOException if reading or closing the stream fails
   */
  private String readFile(Path file, Charset defaultEncoding) throws IOException {
    CharsetDetector detector = new CharsetDetector(file, defaultEncoding);
    assertThat(detector.run()).isTrue();
    // Close the reader (and with it the wrapped detector stream) once the content has been read,
    // so that no file handle is left open after the test.
    try (InputStreamReader reader = new InputStreamReader(detector.inputStream(), detector.charset())) {
      List<String> readLines = IOUtils.readLines(reader);
      return StringUtils.join(readLines, "\n");
    }
  }

  /**
   * Runs the detector on {@code file}, asserts that detection succeeded, and returns the
   * detected charset.
   *
   * @param file the file to inspect
   * @param defaultEncoding the encoding handed to the detector as the default
   * @return the charset reported by the detector
   */
  private Charset detectCharset(Path file, Charset defaultEncoding) {
    CharsetDetector detector = new CharsetDetector(file, defaultEncoding);
    assertThat(detector.run()).isTrue();
    return detector.charset();
  }
}