You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

CharsetDetector.java 2.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293
  1. /*
  2. * SonarQube
  3. * Copyright (C) 2009-2019 SonarSource SA
  4. * mailto:info AT sonarsource DOT com
  5. *
  6. * This program is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 3 of the License, or (at your option) any later version.
  10. *
  11. * This program is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public License
  17. * along with this program; if not, write to the Free Software Foundation,
  18. * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  19. */
  20. package org.sonar.scanner.scan.filesystem;
  21. import java.io.BufferedInputStream;
  22. import java.io.IOException;
  23. import java.io.InputStream;
  24. import java.nio.charset.Charset;
  25. import java.nio.file.Files;
  26. import java.nio.file.Path;
  27. import java.util.Arrays;
  28. import javax.annotation.CheckForNull;
  29. import org.apache.commons.io.ByteOrderMark;
  30. import org.apache.commons.io.IOUtils;
  31. public class CharsetDetector {
  32. private static final int BYTES_TO_DECODE = 4192;
  33. private final Path filePath;
  34. private BufferedInputStream stream;
  35. private Charset detectedCharset;
  36. private Charset userEncoding;
  37. public CharsetDetector(Path filePath, Charset userEncoding) {
  38. this.filePath = filePath;
  39. this.userEncoding = userEncoding;
  40. }
  41. public boolean run() {
  42. try {
  43. byte[] buf = readBuffer();
  44. return detectCharset(buf);
  45. } catch (IOException e) {
  46. throw new IllegalStateException("Unable to read file " + filePath.toAbsolutePath().toString(), e);
  47. }
  48. }
  49. @CheckForNull
  50. public Charset charset() {
  51. assertRun();
  52. return detectedCharset;
  53. }
  54. public InputStream inputStream() {
  55. assertRun();
  56. return stream;
  57. }
  58. private byte[] readBuffer() throws IOException {
  59. stream = new BufferedInputStream(Files.newInputStream(filePath), BYTES_TO_DECODE * 2);
  60. stream.mark(BYTES_TO_DECODE);
  61. byte[] buf = new byte[BYTES_TO_DECODE];
  62. int read = IOUtils.read(stream, buf, 0, BYTES_TO_DECODE);
  63. stream.reset();
  64. stream.mark(-1);
  65. return Arrays.copyOf(buf, read);
  66. }
  67. private boolean detectCharset(byte[] buf) throws IOException {
  68. ByteCharsetDetector detector = new ByteCharsetDetector(new CharsetValidation(), userEncoding);
  69. ByteOrderMark bom = detector.detectBOM(buf);
  70. if (bom != null) {
  71. detectedCharset = Charset.forName(bom.getCharsetName());
  72. stream.skip(bom.length());
  73. return true;
  74. }
  75. detectedCharset = detector.detect(buf);
  76. return detectedCharset != null;
  77. }
  78. private void assertRun() {
  79. if (stream == null) {
  80. throw new IllegalStateException("Charset detection did not run");
  81. }
  82. }
  83. }