You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

charset.go 6.4KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211
  1. // Copyright 2014 The Gogs Authors. All rights reserved.
  2. // SPDX-License-Identifier: MIT
  3. package charset
  4. import (
  5. "bytes"
  6. "fmt"
  7. "io"
  8. "strings"
  9. "unicode/utf8"
  10. "code.gitea.io/gitea/modules/log"
  11. "code.gitea.io/gitea/modules/setting"
  12. "code.gitea.io/gitea/modules/util"
  13. "github.com/gogs/chardet"
  14. "golang.org/x/net/html/charset"
  15. "golang.org/x/text/transform"
  16. )
// UTF8BOM is the UTF-8 byte-order mark (EF BB BF) as a byte slice.
var UTF8BOM = []byte{'\xef', '\xbb', '\xbf'}
// ConvertOpts carries the options shared by the conversion functions in this package.
type ConvertOpts struct {
	// KeepBOM, when true, preserves a leading UTF-8 BOM in the converted
	// output; when false the BOM is stripped (see MaybeRemoveBOM).
	KeepBOM bool
}
  22. // ToUTF8WithFallbackReader detects the encoding of content and converts to UTF-8 reader if possible
  23. func ToUTF8WithFallbackReader(rd io.Reader, opts ConvertOpts) io.Reader {
  24. buf := make([]byte, 2048)
  25. n, err := util.ReadAtMost(rd, buf)
  26. if err != nil {
  27. return io.MultiReader(bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)), rd)
  28. }
  29. charsetLabel, err := DetectEncoding(buf[:n])
  30. if err != nil || charsetLabel == "UTF-8" {
  31. return io.MultiReader(bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)), rd)
  32. }
  33. encoding, _ := charset.Lookup(charsetLabel)
  34. if encoding == nil {
  35. return io.MultiReader(bytes.NewReader(buf[:n]), rd)
  36. }
  37. return transform.NewReader(
  38. io.MultiReader(
  39. bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)),
  40. rd,
  41. ),
  42. encoding.NewDecoder(),
  43. )
  44. }
  45. // ToUTF8 converts content to UTF8 encoding
  46. func ToUTF8(content []byte, opts ConvertOpts) (string, error) {
  47. charsetLabel, err := DetectEncoding(content)
  48. if err != nil {
  49. return "", err
  50. } else if charsetLabel == "UTF-8" {
  51. return string(MaybeRemoveBOM(content, opts)), nil
  52. }
  53. encoding, _ := charset.Lookup(charsetLabel)
  54. if encoding == nil {
  55. return string(content), fmt.Errorf("Unknown encoding: %s", charsetLabel)
  56. }
  57. // If there is an error, we concatenate the nicely decoded part and the
  58. // original left over. This way we won't lose much data.
  59. result, n, err := transform.Bytes(encoding.NewDecoder(), content)
  60. if err != nil {
  61. result = append(result, content[n:]...)
  62. }
  63. result = MaybeRemoveBOM(result, opts)
  64. return string(result), err
  65. }
  66. // ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible
  67. func ToUTF8WithFallback(content []byte, opts ConvertOpts) []byte {
  68. bs, _ := io.ReadAll(ToUTF8WithFallbackReader(bytes.NewReader(content), opts))
  69. return bs
  70. }
// ToUTF8DropErrors makes sure the return string is valid utf-8; attempts conversion if possible
func ToUTF8DropErrors(content []byte, opts ConvertOpts) []byte {
	charsetLabel, err := DetectEncoding(content)
	if err != nil || charsetLabel == "UTF-8" {
		// Already UTF-8 (or undetectable): only strip the BOM if requested.
		return MaybeRemoveBOM(content, opts)
	}
	encoding, _ := charset.Lookup(charsetLabel)
	if encoding == nil {
		// No decoder registered for the detected label; return bytes unchanged.
		return content
	}
	// We ignore any non-decodable parts from the file.
	// Some parts might be lost
	var decoded []byte
	decoder := encoding.NewDecoder()
	idx := 0
	for {
		// Decode as much as possible from the current offset; on error, n is
		// the number of source bytes successfully consumed before it.
		result, n, err := transform.Bytes(decoder, content[idx:])
		decoded = append(decoded, result...)
		if err == nil {
			break
		}
		// Substitute a space for the undecodable byte and resume just past it.
		decoded = append(decoded, ' ')
		idx = idx + n + 1
		if idx >= len(content) {
			break
		}
	}
	return MaybeRemoveBOM(decoded, opts)
}
  100. // MaybeRemoveBOM removes a UTF-8 BOM from a []byte when opts.KeepBOM is false
  101. func MaybeRemoveBOM(content []byte, opts ConvertOpts) []byte {
  102. if opts.KeepBOM {
  103. return content
  104. }
  105. if len(content) > 2 && bytes.Equal(content[0:3], UTF8BOM) {
  106. return content[3:]
  107. }
  108. return content
  109. }
// DetectEncoding detect the encoding of content
//
// Returns "UTF-8" via a fast path for valid UTF-8, otherwise runs chardet
// and tie-breaks equal-confidence results using the configured
// DetectedCharsetScore priorities. A configured AnsiCharset overrides any
// non-UTF-8 detection result.
func DetectEncoding(content []byte) (string, error) {
	// First we check if the content represents valid utf8 content excepting a truncated character at the end.
	// Now we could decode all the runes in turn but this is not necessarily the cheapest thing to do
	// instead we walk backwards from the end to trim off a the incomplete character
	toValidate := content
	end := len(toValidate) - 1
	if end < 0 {
		// no-op
	} else if toValidate[end]>>5 == 0b110 {
		// Incomplete 1 byte extension e.g. © <c2><a9> which has been truncated to <c2>
		toValidate = toValidate[:end]
	} else if end > 0 && toValidate[end]>>6 == 0b10 && toValidate[end-1]>>4 == 0b1110 {
		// Incomplete 2 byte extension e.g. ⛔ <e2><9b><94> which has been truncated to <e2><9b>
		toValidate = toValidate[:end-1]
	} else if end > 1 && toValidate[end]>>6 == 0b10 && toValidate[end-1]>>6 == 0b10 && toValidate[end-2]>>3 == 0b11110 {
		// Incomplete 3 byte extension e.g. 💩 <f0><9f><92><a9> which has been truncated to <f0><9f><92>
		toValidate = toValidate[:end-2]
	}
	if utf8.Valid(toValidate) {
		log.Debug("Detected encoding: utf-8 (fast)")
		return "UTF-8", nil
	}
	textDetector := chardet.NewTextDetector()
	var detectContent []byte
	if len(content) < 1024 {
		// Check if original content is valid
		if _, err := textDetector.DetectBest(content); err != nil {
			return "", err
		}
		// Repeat short content up to ~1KB to give the detector a larger
		// sample. len(content) > 0 here: empty content is valid UTF-8 and
		// returned on the fast path above, so no division by zero.
		times := 1024 / len(content)
		detectContent = make([]byte, 0, times*len(content))
		for i := 0; i < times; i++ {
			detectContent = append(detectContent, content...)
		}
	} else {
		detectContent = content
	}
	// Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break
	results, err := textDetector.DetectAll(detectContent)
	if err != nil {
		if err == chardet.NotDetectedError && len(setting.Repository.AnsiCharset) > 0 {
			log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
			return setting.Repository.AnsiCharset, nil
		}
		return "", err
	}
	// Results are sorted by confidence; start from the first and prefer any
	// equally-confident result with a better (lower) configured score.
	topConfidence := results[0].Confidence
	topResult := results[0]
	priority, has := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(topResult.Charset))]
	for _, result := range results {
		// As results are sorted in confidence order - if we have a different confidence
		// we know it's less than the current confidence and can break out of the loop early
		if result.Confidence != topConfidence {
			break
		}
		// Otherwise check if this results is earlier in the DetectedCharsetOrder than our current top guess
		resultPriority, resultHas := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(result.Charset))]
		if resultHas && (!has || resultPriority < priority) {
			topResult = result
			priority = resultPriority
			has = true
		}
	}
	// FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument
	if topResult.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 {
		log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
		return setting.Repository.AnsiCharset, err
	}
	log.Debug("Detected encoding: %s", topResult.Charset)
	return topResult.Charset, err
}