From 8857bdee5050c71a919085803ad23d292181eaaf Mon Sep 17 00:00:00 2001 From: Duarte Meneses Date: Wed, 3 May 2017 16:03:24 +0200 Subject: [PATCH] SONAR-6100 Improve support of binary files and/or files with different encoding --- .../scan/filesystem/ByteCharsetDetector.java | 111 +++++++ .../scan/filesystem/CharsetDetector.java | 90 ++---- .../scan/filesystem/CharsetValidation.java | 289 ++++++++++++++++++ .../scan/filesystem/MetadataGenerator.java | 13 +- .../filesystem/ByteCharsetDetectorTest.java | 135 ++++++++ .../scan/filesystem/CharsetDetectorTest.java | 57 ++-- .../filesystem/CharsetValidationTest.java | 183 +++++++++++ .../filesystem/MetadataGeneratorTest.java | 10 + .../scanner/scan/filesystem/UTF-32LE.txt | Bin 40 -> 36 bytes 9 files changed, 803 insertions(+), 85 deletions(-) create mode 100644 sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetector.java create mode 100644 sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetValidation.java create mode 100644 sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetectorTest.java create mode 100644 sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetValidationTest.java diff --git a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetector.java b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetector.java new file mode 100644 index 00000000000..e0d7fecf320 --- /dev/null +++ b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetector.java @@ -0,0 +1,111 @@ +/* + * SonarQube + * Copyright (C) 2009-2017 SonarSource SA + * mailto:info AT sonarsource DOT com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later 
version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ +package org.sonar.scanner.scan.filesystem; + +import static java.nio.charset.StandardCharsets.UTF_16; +import static java.nio.charset.StandardCharsets.UTF_16LE; +import static java.nio.charset.StandardCharsets.UTF_16BE; +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.nio.charset.Charset; +import java.util.Arrays; + +import javax.annotation.CheckForNull; + +import org.apache.commons.io.ByteOrderMark; +import org.sonar.scanner.scan.filesystem.CharsetValidation.Result; +import org.sonar.scanner.scan.filesystem.CharsetValidation.Validation; + +public class ByteCharsetDetector { + // these needs to be sorted by longer first! + private static final ByteOrderMark[] boms = {ByteOrderMark.UTF_8, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE, + ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE}; + + private Charset userConfiguration; + private CharsetValidation validator; + + public ByteCharsetDetector(CharsetValidation validator, Charset userConfiguration) { + this.validator = validator; + this.userConfiguration = userConfiguration; + } + + @CheckForNull + public Charset detect(byte[] buf) { + // Try UTF-8 first since we are very confident in it if it's a yes. + // Fail if we see nulls to not have FPs if the text is ASCII encoded in UTF-16. 
+ Result utf8Result = validator.isUTF8(buf, true); + if (utf8Result.valid() == Validation.YES) { + return utf8Result.charset(); + } else if (utf8Result.valid() == Validation.MAYBE) { + return detectAscii(buf); + } + + // try UTF16 with both endiness. Fail if we see nulls to not have FPs if it's UTF-32. + Result utf16 = validator.isUTF16(buf, true); + if (utf16.valid() == Validation.YES && validator.isValidUTF16(buf, UTF_16LE.equals(utf16.charset()))) { + return utf16.charset(); + } + + // at this point we know it can't be UTF-8 + Charset c = userConfiguration; + if (!UTF_8.equals(c) && (!isUtf16(c) || utf16.valid() == Validation.MAYBE) && validator.tryDecode(buf, c)) { + return c; + } + + return null; + } + + private Charset detectAscii(byte[] buf) { + if (!isUtf16Or32(userConfiguration) && validator.tryDecode(buf, userConfiguration)) { + return userConfiguration; + } + + return null; + } + + private static boolean isUtf16(Charset charset) { + return UTF_16.equals(charset) || UTF_16BE.equals(charset) || UTF_16LE.equals(charset); + } + + private static boolean isUtf16Or32(Charset charset) { + return isUtf16(charset) || MetadataGenerator.UTF_32BE.equals(charset) || MetadataGenerator.UTF_32LE.equals(charset); + } + + @CheckForNull + public ByteOrderMark detectBOM(byte[] buffer) { + return Arrays.stream(boms) + .filter(b -> isBom(b, buffer)) + .findAny() + .orElse(null); + } + + private static boolean isBom(ByteOrderMark bom, byte[] buffer) { + if (buffer.length < bom.length()) { + return false; + } + for (int i = 0; i < bom.length(); i++) { + if ((byte) bom.get(i) != buffer[i]) { + return false; + } + } + return true; + } + +} diff --git a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetDetector.java b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetDetector.java index 3586ea9f171..a2eb629c1a7 100644 --- a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetDetector.java +++ 
b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetDetector.java @@ -22,51 +22,38 @@ package org.sonar.scanner.scan.filesystem; import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; -import java.nio.ByteBuffer; -import java.nio.charset.CharacterCodingException; import java.nio.charset.Charset; -import java.nio.charset.CharsetDecoder; -import java.nio.charset.CodingErrorAction; -import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; -import java.util.LinkedHashSet; -import java.util.Set; +import java.util.Arrays; + +import javax.annotation.CheckForNull; import org.apache.commons.io.ByteOrderMark; import org.apache.commons.io.IOUtils; -import org.apache.commons.io.input.BOMInputStream; public class CharsetDetector { - private static final int BYTES_TO_DECODE = 512; + private static final int BYTES_TO_DECODE = 4192; private Path filePath; - private BOMInputStream stream; + private BufferedInputStream stream; private Charset detectedCharset; - private Charset defaultEncoding; + private Charset userEncoding; - public CharsetDetector(Path filePath, Charset defaultEncoding) { + public CharsetDetector(Path filePath, Charset userEncoding) { this.filePath = filePath; - this.defaultEncoding = defaultEncoding; + this.userEncoding = userEncoding; } public boolean run() { try { - stream = createInputStream(filePath); - if (detectBOM()) { - return true; - } - - if (detectCharset()) { - return true; - } - - detectedCharset = defaultEncoding; - return false; + byte[] buf = readBuffer(); + return detectCharset(buf); } catch (IOException e) { throw new IllegalStateException("Unable to read file " + filePath.toAbsolutePath().toString(), e); } } + @CheckForNull public Charset charset() { assertRun(); return detectedCharset; @@ -77,52 +64,27 @@ public class CharsetDetector { return stream; } - private static BOMInputStream createInputStream(Path path) throws IOException { - 
BufferedInputStream bufferedStream = new BufferedInputStream(Files.newInputStream(path)); - return new BOMInputStream(bufferedStream, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16LE, - ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE); - } - - private boolean detectBOM() throws IOException { - String charsetName = stream.getBOMCharsetName(); - if (charsetName != null) { - detectedCharset = Charset.forName(charsetName); - return true; - } - return false; - } - - private boolean detectCharset() throws IOException { + private byte[] readBuffer() throws IOException { + stream = new BufferedInputStream(Files.newInputStream(filePath), BYTES_TO_DECODE * 2); stream.mark(BYTES_TO_DECODE); byte[] buf = new byte[BYTES_TO_DECODE]; - int len = IOUtils.read(stream, buf, 0, BYTES_TO_DECODE); + int read = IOUtils.read(stream, buf, 0, BYTES_TO_DECODE); stream.reset(); - - Set charsets = new LinkedHashSet<>(); - charsets.add(defaultEncoding); - charsets.add(StandardCharsets.UTF_8); - charsets.add(Charset.defaultCharset()); - - for (Charset c : charsets) { - if (tryDecode(buf, len, c)) { - detectedCharset = c; - return true; - } - } - return false; + stream.mark(-1); + return Arrays.copyOf(buf, read); } - private static boolean tryDecode(byte[] bytes, int len, Charset charset) throws IOException { - CharsetDecoder decoder = charset.newDecoder() - .onMalformedInput(CodingErrorAction.REPORT) - .onUnmappableCharacter(CodingErrorAction.REPORT); - - try { - decoder.decode(ByteBuffer.wrap(bytes, 0, len)); - } catch (CharacterCodingException e) { - return false; + private boolean detectCharset(byte[] buf) throws IOException { + ByteCharsetDetector detector = new ByteCharsetDetector(new CharsetValidation(), userEncoding); + ByteOrderMark bom = detector.detectBOM(buf); + if (bom != null) { + detectedCharset = Charset.forName(bom.getCharsetName()); + stream.skip(bom.length()); + return true; } - return true; + + detectedCharset = detector.detect(buf); + return detectedCharset 
!= null; } private void assertRun() { diff --git a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetValidation.java b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetValidation.java new file mode 100644 index 00000000000..8dd8f44240d --- /dev/null +++ b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/CharsetValidation.java @@ -0,0 +1,289 @@ +/* + * SonarQube + * Copyright (C) 2009-2017 SonarSource SA + * mailto:info AT sonarsource DOT com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ +package org.sonar.scanner.scan.filesystem; + +import java.nio.ByteBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; +import java.nio.charset.StandardCharsets; + +import javax.annotation.CheckForNull; +import javax.annotation.Nullable; + +public class CharsetValidation { + + private static final float UTF_16_NULL_PASS_THRESHOLD = 0.7f; + private static final float UTF_16_NULL_FAIL_THRESHOLD = 0.1f; + + /** + * Checks if an array of bytes looks UTF-16 encoded. + * We look for clues by checking the presence of nulls and new line control chars in both little and big endian byte orders. 
+ * Failing on nulls will greatly reduce FPs if the buffer is actually encoded in UTF-32. + * + * Note that for any unicode between 0-255, UTF-16 encodes it directly in 2 bytes, the first byte being 0 (null). Since ASCII, ANSI and control chars are + * within this range, we look at the number of nulls and see if it is above a certain threshold. + * It's possible to have valid chars that map to the opposite (non-null followed by a null) even though it is very unlikely. + * That will happen, for example, for any unicode 0x??00, with ?? between 00 and D7. For this reason, we give a small maximum tolerance + * for opposite nulls (10%). + * + * Line feed code point (0x000A) reversed would be (0x0A00). This code point is reserved and should never be found. + * + */ + public Result isUTF16(byte[] buffer, boolean failOnNull) { + if (buffer.length < 2) { + return Result.INVALID; + } + + int beAscii = 0; + int beLines = 0; + int leAscii = 0; + int leLines = 0; + + for (int i = 0; i < buffer.length / 2; i++) { + // using bytes is fine, since we will compare with positive numbers only + byte c1 = buffer[i * 2]; + byte c2 = buffer[i * 2 + 1]; + + if (c1 == 0) { + if (c2 != 0) { + if (c2 == 0x0a || c2 == 0x0d) { + beLines++; + } + beAscii++; + } else if (failOnNull) { + // it's probably UTF-32 or binary + return Result.INVALID; + } + } else if (c2 == 0) { + leAscii++; + if (c1 == 0x0a || c1 == 0x0d) { + leLines++; + } + } + } + + float beAsciiPerc = beAscii * 2.0f / (float) buffer.length; + float leAsciiPerc = leAscii * 2.0f / (float) buffer.length; + + if (leLines == 0) { + // could be BE + if (beAsciiPerc >= UTF_16_NULL_PASS_THRESHOLD && leAsciiPerc < UTF_16_NULL_FAIL_THRESHOLD) { + return Result.newValid(StandardCharsets.UTF_16BE); + } + if (beLines > 0) { + // this gives FPs for UTF-32 if !failOnNull + return Result.newValid(StandardCharsets.UTF_16BE); + } + } else if (beLines > 0) { + // lines detected with both endianness -> can't be utf-16 + return Result.INVALID; + } + if (beLines 
== 0) { + // could be LE + if (leAsciiPerc >= UTF_16_NULL_PASS_THRESHOLD && beAsciiPerc < UTF_16_NULL_FAIL_THRESHOLD) { + return Result.newValid(StandardCharsets.UTF_16LE); + } + if (leLines > 0) { + // this gives FPs for UTF-32 if !failOnNull + return Result.newValid(StandardCharsets.UTF_16LE); + } + } + + // if we reach here, it means that there wasn't a line feed for a single endianness and we didn't see a strong null pattern for any of the + // endianness. + // It could happen if there are no line feeds in the text and it's a language that does not use ANSI (unicode > 255). + return new Result(Validation.MAYBE, null); + } + + /** + * Checks whether it's a valid UTF-16-encoded buffer. + * Most sequences of bytes of any encoding will be valid UTF-16, so this is not very effective and often gives + * false positives. + * + * Possible 16bit values in UTF-16: + * + * 0x0000-0xD7FF: single 16bit block + * 0xD800-0xDBFF: first block + * 0xDC00-0xDFFF: second block + * 0xE000-0xFFFF: single 16 bit block + * + * The following UTF code points get mapped into 1 or 2 blocks: + * 0x0000 -0xD7FF (0 -55295) : 2 bytes, direct mapping + * 0xE000 -0xFFFF (57344-65535) : 2 bytes, direct mapping + * 0x10000-0x10FFFF (65536-1114111): 2 blocks of 2 bytes (not direct..) + * + * Note that Unicode 55296-57343 (0xD800 to 0xDFFF) are not used, since they are reserved and used in UTF-16 for the high/low surrogates. + * + * We reject 2-byte blocks with 0 (we consider it's binary) even though it's a valid UTF-16 encoding. 
+ * + */ + public boolean isValidUTF16(byte[] buffer) { + return isValidUTF16(buffer, false); + } + + public boolean isValidUTF16(byte[] buffer, boolean le) { + if (buffer.length < 2) { + return false; + } + for (int i = 0; i < buffer.length / 2; i++) { + boolean extraByte = false; + int c = read16bit(buffer, i, le); + + if (c >= 0xD800 && c < 0xDC00) { + // it's a higher surrogate (10 bits) + extraByte = true; + i++; + } else if ((c >= 0xDC00 && c < 0xE000) || c == 0) { + return false; + } + // else it is a simple 2 byte encoding (code points in BMP), and it's valid + + if (extraByte && i < buffer.length / 2) { + c = read16bit(buffer, i, le); + if (c < 0xDC00 || c >= 0xE000) { + // invalid lower surrogate (10 bits) + return false; + } + } + } + return true; + } + + /** + * Checks if a buffer contains only valid UTF8 encoded bytes. + * It's very effective, giving a clear YES/NO, unless it's ASCII (unicode <= 127), in which case it returns MAYBE. + * + * + * First byte: + * 0xxxxxxx: only one byte (0-127) + * 110xxxxx: 2 bytes (194-223, as 192/193 are invalid) + * 1110xxxx: 3 bytes (224-239) + * 11110xxx: 4 bytes (240-244) + * + * Bytes 2,3 and 4 are always 10xxxxxx (0x80-0xBF or 128-191). 
+ * + * So depending on the number of significant bits in the unicode code point, the length will be 1,2,3 or 4 bytes: + * 0 -7 bits (0x0000-007F): 1 byte encoding + * 8 -11 bits (0x0080-07FF): 2 bytes encoding + * 12-16 bits (0x0800-FFFF): 3 bytes encoding + * 17-21 bits (0x10000-10FFFF): 4 bytes encoding + */ + public Result isUTF8(byte[] buffer, boolean rejectNulls) { + boolean onlyAscii = true; + + for (int i = 0; i < buffer.length; i++) { + byte len; + // make it unsigned for the comparisons + int c = (0xFF) & buffer[i]; + + if (rejectNulls && c == 0) { + return Result.INVALID; + } + if ((c & 0b10000000) == 0) { + len = 0; + } else if (c >= 194 && c < 224) { + len = 1; + } else if ((c & 0b11110000) == 0b11100000) { + len = 2; + } else if ((c & 0b11111000) == 0b11110000) { + len = 3; + } else { + return Result.INVALID; + } + + while (len > 0) { + i++; + if (i >= buffer.length) { + break; + } + c = (0xFF) & buffer[i]; + onlyAscii = false; + + // first 2 bits should be 10 + if ((c & 0b11000000) != 0b10000000) { + return Result.INVALID; + } + len--; + } + } + + return onlyAscii ? new Result(Validation.MAYBE, StandardCharsets.UTF_8) : Result.newValid(StandardCharsets.UTF_8); + } + + /** + * Tries to use the given charset to decode the byte array. + * @return true if decoding succeeded, false if there was a decoding error. + */ + public boolean tryDecode(byte[] bytes, @Nullable Charset charset) { + if (charset == null) { + return false; + } + CharsetDecoder decoder = charset.newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + + try { + decoder.decode(ByteBuffer.wrap(bytes)); + } catch (CharacterCodingException e) { + return false; + } + return true; + } + + private static int read16bit(byte[] buffer, int i, boolean le) { + return le ? 
(buffer[i / 2] & 0xff) | ((buffer[i / 2 + 1] & 0xff) << 8) + : ((buffer[i / 2] & 0xff) << 8) | (buffer[i / 2 + 1] & 0xff); + } + + public enum Validation { + NO, + YES, + MAYBE + } + + public static class Result { + static final Result INVALID = new Result(Validation.NO, null); + private Validation valid; + private Charset charset; + + public Result(Validation valid, @Nullable Charset charset) { + this.valid = valid; + this.charset = charset; + } + + public static Result newValid(Charset charset) { + return new Result(Validation.YES, charset); + } + + public Validation valid() { + return valid; + } + + /** + * Only non-null if Valid.Yes + */ + @CheckForNull + public Charset charset() { + return charset; + } + } +} diff --git a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/MetadataGenerator.java b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/MetadataGenerator.java index f8b796688f7..6c408522a09 100644 --- a/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/MetadataGenerator.java +++ b/sonar-scanner-engine/src/main/java/org/sonar/scanner/scan/filesystem/MetadataGenerator.java @@ -58,11 +58,16 @@ class MetadataGenerator { * It is an expensive computation, reading the entire file. */ public void setMetadata(final DefaultInputFile inputFile, Charset defaultEncoding) { - CharsetDetector detector = new CharsetDetector(inputFile.path(), defaultEncoding); + CharsetDetector charsetDetector = new CharsetDetector(inputFile.path(), defaultEncoding); try { - detector.run(); - Charset charset = detector.charset(); - InputStream is = detector.inputStream(); + Charset charset; + if (charsetDetector.run()) { + charset = charsetDetector.charset(); + } else { + LOG.debug("Failed to detect a valid charset for file '{}'. 
Using default charset.", inputFile.relativePath()); + charset = defaultEncoding; + } + InputStream is = charsetDetector.inputStream(); inputFile.setCharset(charset); Metadata metadata = fileMetadata.readMetadata(is, charset, inputFile.absolutePath(), exclusionsScanner.createCharHandlerFor(inputFile.key())); inputFile.setMetadata(metadata); diff --git a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetectorTest.java b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetectorTest.java new file mode 100644 index 00000000000..c1da6f08a24 --- /dev/null +++ b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/ByteCharsetDetectorTest.java @@ -0,0 +1,135 @@ +/* + * SonarQube + * Copyright (C) 2009-2017 SonarSource SA + * mailto:info AT sonarsource DOT com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ +package org.sonar.scanner.scan.filesystem; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Matchers.any; +import static org.mockito.Matchers.anyBoolean; +import static org.mockito.Matchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; + +import org.apache.commons.io.ByteOrderMark; +import org.junit.Before; +import org.junit.Test; +import org.sonar.scanner.scan.filesystem.CharsetValidation.Result; +import org.sonar.scanner.scan.filesystem.CharsetValidation.Validation; + +public class ByteCharsetDetectorTest { + private CharsetValidation validation; + private ByteCharsetDetector charsets; + + @Before + public void setUp() { + validation = mock(CharsetValidation.class); + charsets = new ByteCharsetDetector(validation, null); + } + + @Test + public void detectBOM() throws URISyntaxException, IOException { + byte[] b = ByteOrderMark.UTF_16BE.getBytes(); + assertThat(charsets.detectBOM(b)).isEqualTo(ByteOrderMark.UTF_16BE); + + assertThat(charsets.detectBOM(readFile("UTF-8"))).isEqualTo(ByteOrderMark.UTF_8); + assertThat(charsets.detectBOM(readFile("UTF-16BE"))).isEqualTo(ByteOrderMark.UTF_16BE); + assertThat(charsets.detectBOM(readFile("UTF-16LE"))).isEqualTo(ByteOrderMark.UTF_16LE); + assertThat(charsets.detectBOM(readFile("UTF-32BE"))).isEqualTo(ByteOrderMark.UTF_32BE); + assertThat(charsets.detectBOM(readFile("UTF-32LE"))).isEqualTo(ByteOrderMark.UTF_32LE); + } + + private byte[] readFile(String fileName) throws URISyntaxException, IOException { + Path path = Paths.get(this.getClass().getClassLoader().getResource("org/sonar/scanner/scan/filesystem/" + fileName + ".txt").toURI()); + return Files.readAllBytes(path); + } + + @Test + public void tryUTF8First() { + 
when(validation.isUTF8(any(byte[].class), anyBoolean())).thenReturn(Result.newValid(StandardCharsets.UTF_8)); + assertThat(charsets.detect(new byte[1])).isEqualTo(StandardCharsets.UTF_8); + } + + @Test + public void tryUTF16heuristics() { + when(validation.isUTF8(any(byte[].class), anyBoolean())).thenReturn(Result.INVALID); + when(validation.isUTF16(any(byte[].class), anyBoolean())).thenReturn(Result.newValid(StandardCharsets.UTF_16)); + when(validation.isValidUTF16(any(byte[].class), anyBoolean())).thenReturn(true); + + assertThat(charsets.detect(new byte[1])).isEqualTo(StandardCharsets.UTF_16); + } + + @Test + public void failAll() { + when(validation.isUTF8(any(byte[].class), anyBoolean())).thenReturn(Result.INVALID); + when(validation.isUTF16(any(byte[].class), anyBoolean())).thenReturn(new Result(Validation.MAYBE, null)); + + assertThat(charsets.detect(new byte[1])).isEqualTo(null); + } + + @Test + public void failAnsii() { + when(validation.isUTF8(any(byte[].class), anyBoolean())).thenReturn(new Result(Validation.MAYBE, null)); + when(validation.isUTF16(any(byte[].class), anyBoolean())).thenReturn(Result.newValid(StandardCharsets.UTF_16)); + when(validation.isValidUTF16(any(byte[].class), anyBoolean())).thenReturn(true); + + assertThat(charsets.detect(new byte[1])).isEqualTo(null); + } + + @Test + public void tryUserAnsii() { + when(validation.isUTF8(any(byte[].class), anyBoolean())).thenReturn(new Result(Validation.MAYBE, null)); + when(validation.isUTF16(any(byte[].class), anyBoolean())).thenReturn(Result.newValid(StandardCharsets.UTF_16)); + when(validation.isValidUTF16(any(byte[].class), anyBoolean())).thenReturn(true); + when(validation.tryDecode(any(byte[].class), eq(StandardCharsets.ISO_8859_1))).thenReturn(true); + + charsets = new ByteCharsetDetector(validation, StandardCharsets.ISO_8859_1); + assertThat(charsets.detect(new byte[1])).isEqualTo(StandardCharsets.ISO_8859_1); + } + + @Test + public void tryOtherUserCharset() { + 
when(validation.isUTF8(any(byte[].class), anyBoolean())).thenReturn(Result.INVALID); + when(validation.isUTF16(any(byte[].class), anyBoolean())).thenReturn(new Result(Validation.MAYBE, null)); + when(validation.tryDecode(any(byte[].class), eq(StandardCharsets.ISO_8859_1))).thenReturn(true); + + charsets = new ByteCharsetDetector(validation, StandardCharsets.ISO_8859_1); + assertThat(charsets.detect(new byte[1])).isEqualTo(StandardCharsets.ISO_8859_1); + } + + @Test + public void invalidBOM() { + byte[] b1 = {(byte) 0xFF, (byte) 0xFF}; + assertThat(charsets.detectBOM(b1)).isNull(); + + // not enough bytes + byte[] b2 = {(byte) 0xFE}; + assertThat(charsets.detectBOM(b2)).isNull(); + + // empty + byte[] b3 = new byte[0]; + assertThat(charsets.detectBOM(b3)).isNull(); + } +} diff --git a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetDetectorTest.java b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetDetectorTest.java index 5abdb43764a..aeb38ee7993 100644 --- a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetDetectorTest.java +++ b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetDetectorTest.java @@ -19,18 +19,25 @@ */ package org.sonar.scanner.scan.filesystem; +import static java.nio.charset.StandardCharsets.US_ASCII; +import static java.nio.charset.StandardCharsets.UTF_16; +import static java.nio.charset.StandardCharsets.UTF_16BE; +import static java.nio.charset.StandardCharsets.UTF_16LE; +import static java.nio.charset.StandardCharsets.UTF_8; import static org.assertj.core.api.Assertions.assertThat; import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.io.OutputStreamWriter; +import java.io.InputStreamReader; import java.nio.charset.Charset; -import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.List; import java.util.Random; +import 
org.apache.commons.io.IOUtils; +import org.apache.commons.lang.StringUtils; import org.junit.Rule; import org.junit.Test; import org.junit.rules.ExpectedException; @@ -47,33 +54,42 @@ public class CharsetDetectorTest { public void should_detect_charset_from_BOM() { Path basedir = Paths.get("src/test/resources/org/sonar/scanner/scan/filesystem/"); - assertThat(detectCharset(basedir.resolve("without_BOM.txt"), StandardCharsets.US_ASCII)).isEqualTo(StandardCharsets.US_ASCII); - assertThat(detectCharset(basedir.resolve("UTF-8.txt"), StandardCharsets.US_ASCII)).isEqualTo(StandardCharsets.UTF_8); - assertThat(detectCharset(basedir.resolve("UTF-16BE.txt"), StandardCharsets.US_ASCII)).isEqualTo(StandardCharsets.UTF_16BE); - assertThat(detectCharset(basedir.resolve("UTF-16LE.txt"), StandardCharsets.US_ASCII)).isEqualTo(StandardCharsets.UTF_16LE); - assertThat(detectCharset(basedir.resolve("UTF-32BE.txt"), StandardCharsets.US_ASCII)).isEqualTo(MetadataGenerator.UTF_32BE); - assertThat(detectCharset(basedir.resolve("UTF-32LE.txt"), StandardCharsets.US_ASCII)).isEqualTo(MetadataGenerator.UTF_32LE); + assertThat(detectCharset(basedir.resolve("without_BOM.txt"), US_ASCII)).isEqualTo(US_ASCII); + assertThat(detectCharset(basedir.resolve("UTF-8.txt"), US_ASCII)).isEqualTo(UTF_8); + assertThat(detectCharset(basedir.resolve("UTF-16BE.txt"), US_ASCII)).isEqualTo(UTF_16BE); + assertThat(detectCharset(basedir.resolve("UTF-16LE.txt"), US_ASCII)).isEqualTo(UTF_16LE); + assertThat(detectCharset(basedir.resolve("UTF-32BE.txt"), US_ASCII)).isEqualTo(MetadataGenerator.UTF_32BE); + assertThat(detectCharset(basedir.resolve("UTF-32LE.txt"), US_ASCII)).isEqualTo(MetadataGenerator.UTF_32LE); + } + + @Test + public void should_read_files_from_BOM() throws IOException { + Path basedir = Paths.get("src/test/resources/org/sonar/scanner/scan/filesystem/"); + assertThat(readFile(basedir.resolve("without_BOM.txt"), US_ASCII)).isEqualTo("without BOM"); + assertThat(readFile(basedir.resolve("UTF-8.txt"), 
US_ASCII)).isEqualTo("UTF-8"); + assertThat(readFile(basedir.resolve("UTF-16BE.txt"), US_ASCII)).isEqualTo("UTF-16BE"); + assertThat(readFile(basedir.resolve("UTF-16LE.txt"), US_ASCII)).isEqualTo("UTF-16LE"); + assertThat(readFile(basedir.resolve("UTF-32BE.txt"), US_ASCII)).isEqualTo("UTF-32BE"); + assertThat(readFile(basedir.resolve("UTF-32LE.txt"), US_ASCII)).isEqualTo("UTF-32LE"); } @Test public void always_try_utf8() throws IOException { ByteArrayOutputStream out = new ByteArrayOutputStream(); - try (OutputStreamWriter writer = new OutputStreamWriter(out, "UTF-8")) { - // UTF-16 can't read 1 byte only - writer.write("t"); - } + // this is a valid 2 byte UTF-8. + out.write(194); + out.write(128); Path filePath = temp.newFile().toPath(); Files.write(filePath, out.toByteArray()); - assertThat(detectCharset(filePath, StandardCharsets.UTF_16)).isEqualByComparingTo(StandardCharsets.UTF_8); - + assertThat(detectCharset(filePath, UTF_16)).isEqualTo(UTF_8); } @Test public void fail_if_file_doesnt_exist() { exception.expect(IllegalStateException.class); exception.expectMessage("Unable to read file " + Paths.get("non_existing").toAbsolutePath()); - detectCharset(Paths.get("non_existing"), StandardCharsets.UTF_8); + detectCharset(Paths.get("non_existing"), UTF_8); } @Test @@ -83,9 +99,16 @@ public class CharsetDetectorTest { new Random().nextBytes(b); Files.write(filePath, b); - CharsetDetector detector = new CharsetDetector(filePath, StandardCharsets.UTF_8); + CharsetDetector detector = new CharsetDetector(filePath, UTF_8); assertThat(detector.run()).isFalse(); - assertThat(detector.charset()).isEqualTo(StandardCharsets.UTF_8); + assertThat(detector.charset()).isNull(); + } + + private String readFile(Path file, Charset defaultEncoding) throws IOException { + CharsetDetector detector = new CharsetDetector(file, defaultEncoding); + assertThat(detector.run()).isTrue(); + List readLines = IOUtils.readLines(new InputStreamReader(detector.inputStream(), detector.charset())); + 
return StringUtils.join(readLines, "\n"); } private Charset detectCharset(Path file, Charset defaultEncoding) { diff --git a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetValidationTest.java b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetValidationTest.java new file mode 100644 index 00000000000..365fffb624b --- /dev/null +++ b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/CharsetValidationTest.java @@ -0,0 +1,183 @@ +/* + * SonarQube + * Copyright (C) 2009-2017 SonarSource SA + * mailto:info AT sonarsource DOT com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ +package org.sonar.scanner.scan.filesystem; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CodingErrorAction; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.lang.StringUtils; +import org.junit.Before; +import org.junit.Test; +import org.sonar.scanner.scan.filesystem.CharsetValidation.Validation; + +public class CharsetValidationTest { + private CharsetValidation charsets; + + @Before + public void setUp() { + charsets = new CharsetValidation(); + } + + @Test + public void testWithSourceCode() throws IOException, URISyntaxException { + Path path = Paths.get(this.getClass().getClassLoader().getResource("mediumtest/xoo/sample/xources/hello/HelloJava.xoo").toURI()); + List<String> lines = Files.readAllLines(path, StandardCharsets.UTF_8); + String text = lines.stream().collect(StringBuffer::new, StringBuffer::append, StringBuffer::append).toString(); + + byte[] utf8 = encode(text, StandardCharsets.UTF_8); + byte[] utf16be = encode(text, StandardCharsets.UTF_16BE); + byte[] utf16le = encode(text, StandardCharsets.UTF_16LE); + + assertThat(charsets.isUTF8(utf8, true).charset()).isEqualTo(StandardCharsets.UTF_8); + assertThat(charsets.isUTF16(utf16be, true).charset()).isEqualTo(StandardCharsets.UTF_16BE); + assertThat(charsets.isUTF16(utf16le, true).charset()).isEqualTo(StandardCharsets.UTF_16LE); + + assertThat(charsets.isValidUTF16(utf16be, false)).isTrue(); + assertThat(charsets.isValidUTF16(utf16le, true)).isTrue(); + } + + @Test + public void detectUTF16NewLine() throws CharacterCodingException { + // the first char will be 
encoded with a null on the second byte, but we should still detect it due to the new line + String text = "\uA100" + "\uA212" + "\n"; + + byte[] utf16be = encode(text, StandardCharsets.UTF_16BE); + byte[] utf16le = encode(text, StandardCharsets.UTF_16LE); + byte[] utf8 = encode(text, StandardCharsets.UTF_8); + byte[] utf32 = encode(text, Charset.forName("UTF-32LE")); + + // UTF-32LE encodes each char of this text in 4 bytes, the last two being zero (U+A100 -> 00 A1 00 00) + + assertThat(charsets.isUTF16(utf16le, true).charset()).isEqualTo(StandardCharsets.UTF_16LE); + assertThat(charsets.isUTF16(utf16be, true).charset()).isEqualTo(StandardCharsets.UTF_16BE); + assertThat(charsets.isUTF16(utf8, true).valid()).isEqualTo(Validation.MAYBE); + // this will have a double null, so it will be yes or no based on failOnNull + assertThat(charsets.isUTF16(utf32, true).valid()).isEqualTo(Validation.NO); + assertThat(charsets.isUTF16(utf32, false).valid()).isEqualTo(Validation.YES); + } + + @Test + public void detectUTF16Ascii() throws CharacterCodingException { + String text = "some text to test"; + byte[] utf16be = encode(text, StandardCharsets.UTF_16BE); + byte[] utf16le = encode(text, StandardCharsets.UTF_16LE); + byte[] utf8 = encode(text, StandardCharsets.UTF_8); + byte[] iso88591 = encode(text, StandardCharsets.ISO_8859_1); + byte[] utf32 = encode(text, Charset.forName("UTF-32LE")); + + assertThat(charsets.isUTF16(utf16le, true).charset()).isEqualTo(StandardCharsets.UTF_16LE); + assertThat(charsets.isUTF16(utf16be, true).charset()).isEqualTo(StandardCharsets.UTF_16BE); + // not enough nulls -> we don't know + assertThat(charsets.isUTF16(iso88591, true).valid()).isEqualTo(Validation.MAYBE); + assertThat(charsets.isUTF16(utf8, true).valid()).isEqualTo(Validation.MAYBE); + // fail based on double nulls + assertThat(charsets.isUTF16(utf32, true).valid()).isEqualTo(Validation.NO); + } + + @Test + public void validUTF8() { + // U+2026 (horizontal ellipsis) encoded as 3 UTF-8 bytes + byte[] b = hexToByte("E2 80 A6"); + assertThat(charsets.isUTF8(b, 
true).valid()).isEqualTo(Validation.YES); + } + + @Test + public void invalidUTF16() { + // UTF-16 will accept anything in direct 2 byte block unless it's between D800-DFFF (high and low surrogates). + // In that case, it's a 4 byte encoding, not a direct encoding. + byte[] b1 = hexToByte("D800 0000"); + assertThat(charsets.isValidUTF16(b1)).isFalse(); + + byte[] b1le = hexToByte("0000 D800"); // NOTE(review): read as little endian this is U+0000 then U+00D8, so the rejection presumably comes from the double zero, not from a surrogate - confirm + assertThat(charsets.isValidUTF16(b1le, true)).isFalse(); + + // not enough bytes (any byte following this one would make it valid) + byte[] b2 = {(byte) 0x01}; + assertThat(charsets.isValidUTF16(b2)).isFalse(); + + // we reject double 0 + byte[] b3 = {(byte) 0, (byte) 0}; + assertThat(charsets.isValidUTF16(b3)).isFalse(); + } + + @Test + public void invalidUTF8() { + // valid UTF-8 never contains the bytes 0xFF or 0xC0 + byte[] b1 = {(byte) 0xFF}; + assertThat(charsets.isUTF8(b1, true).valid()).isEqualTo(Validation.NO); + + byte[] b1c = {(byte) 0xC0}; + assertThat(charsets.isUTF8(b1c, true).valid()).isEqualTo(Validation.NO); + + // the first byte indicates a 2-byte encoding, but second byte is not valid + byte[] b2 = {(byte) 0b11000010, (byte) 0b11000000}; + assertThat(charsets.isUTF8(b2, true).valid()).isEqualTo(Validation.NO); + + // we reject nulls (mainly to reject UTF-16) + byte[] b3 = {(byte) 0}; + assertThat(charsets.isUTF8(b3, true).valid()).isEqualTo(Validation.NO); + } + + @Test + public void dontFailIfNotEnoughBytes() { + byte[] b1 = hexToByte("D800"); + assertThat(charsets.isValidUTF16(b1)).isTrue(); + + // the first byte indicates a 2-byte encoding, but there is no second byte + byte[] b2 = {(byte) 0b11000010}; + assertThat(charsets.isUTF8(b2, true).valid()).isEqualTo(Validation.MAYBE); + } + + private byte[] encode(String txt, Charset charset) throws CharacterCodingException { + CharsetEncoder encoder = charset.newEncoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + ByteBuffer encoded = 
encoder.encode(CharBuffer.wrap(txt)); + byte[] b = new byte[encoded.remaining()]; + encoded.get(b); + return b; + } + + private static byte[] hexToByte(String str) { + String s = StringUtils.deleteWhitespace(str); // whitespace only groups the hex digits, e.g. "D800 0000" + int len = s.length(); + byte[] data = new byte[len / 2]; // two hex digits per byte; the loop below assumes an even number of digits + for (int i = 0; i < len; i += 2) { + data[i / 2] = (byte) ((Character.digit(s.charAt(i), 16) << 4) + + Character.digit(s.charAt(i + 1), 16)); + } + return data; + } + +} diff --git a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java index c552cf75610..697450bd782 100644 --- a/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java +++ b/sonar-scanner-engine/src/test/java/org/sonar/scanner/scan/filesystem/MetadataGeneratorTest.java @@ -24,6 +24,7 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; +import java.io.IOException; import java.nio.charset.StandardCharsets; import java.nio.file.Path; import java.nio.file.Paths; @@ -108,6 +109,15 @@ public class MetadataGeneratorTest { assertThat(inputFile.originalLineOffsets()).containsOnly(0, 4, 9); } + @Test + public void use_default_charset_if_detection_fails() throws IOException { + Path tempFile = temp.newFile().toPath(); + byte[] b = {(byte) 0xDF, (byte) 0xFF, (byte) 0xFF}; // not valid UTF-8: 0xDF starts a 2-byte sequence, but 0xFF is never a valid continuation byte + FileUtils.writeByteArrayToFile(tempFile.toFile(), b); + DefaultInputFile inputFile = createInputFileWithMetadata(tempFile); + assertThat(inputFile.charset()).isEqualTo(StandardCharsets.US_ASCII); + } + @Test public void non_existing_file_should_throw_exception() { try { diff --git a/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-32LE.txt b/sonar-scanner-engine/src/test/resources/org/sonar/scanner/scan/filesystem/UTF-32LE.txt index 
6c34c65bd49a0af380f5bb2d331dbdb34c50e2ad..a89e27e4f25b049c903890f327f4a56efa325e6b 100644 GIT binary patch delta 13 ScmdN;nIOaA!@$7c3d8^o$pMl8 delta 17 VcmY$|RB#K6Gd3dCGM3;-1I0hRy& -- 2.39.5