You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

charset_test.go 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274
  1. // Copyright 2019 The Gitea Authors. All rights reserved.
  2. // Use of this source code is governed by a MIT-style
  3. // license that can be found in the LICENSE file.
  4. package charset
  5. import (
  6. "strings"
  7. "testing"
  8. "code.gitea.io/gitea/modules/setting"
  9. "github.com/stretchr/testify/assert"
  10. )
  11. func resetDefaultCharsetsOrder() {
  12. defaultDetectedCharsetsOrder := make([]string, 0, len(setting.Repository.DetectedCharsetsOrder))
  13. for _, charset := range setting.Repository.DetectedCharsetsOrder {
  14. defaultDetectedCharsetsOrder = append(defaultDetectedCharsetsOrder, strings.ToLower(strings.TrimSpace(charset)))
  15. }
  16. setting.Repository.DetectedCharsetScore = map[string]int{}
  17. i := 0
  18. for _, charset := range defaultDetectedCharsetsOrder {
  19. canonicalCharset := strings.ToLower(strings.TrimSpace(charset))
  20. if _, has := setting.Repository.DetectedCharsetScore[canonicalCharset]; !has {
  21. setting.Repository.DetectedCharsetScore[canonicalCharset] = i
  22. i++
  23. }
  24. }
  25. }
  26. func TestRemoveBOMIfPresent(t *testing.T) {
  27. res := RemoveBOMIfPresent([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  28. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  29. res = RemoveBOMIfPresent([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  30. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  31. }
  32. func TestToUTF8WithErr(t *testing.T) {
  33. resetDefaultCharsetsOrder()
  34. var res string
  35. var err error
  36. // Note: golang compiler seems so behave differently depending on the current
  37. // locale, so some conversions might behave differently. For that reason, we don't
  38. // depend on particular conversions but in expected behaviors.
  39. res, err = ToUTF8WithErr([]byte{0x41, 0x42, 0x43})
  40. assert.NoError(t, err)
  41. assert.Equal(t, "ABC", res)
  42. // "áéíóú"
  43. res, err = ToUTF8WithErr([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  44. assert.NoError(t, err)
  45. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res))
  46. // "áéíóú"
  47. res, err = ToUTF8WithErr([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3,
  48. 0xc3, 0xba})
  49. assert.NoError(t, err)
  50. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res))
  51. res, err = ToUTF8WithErr([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
  52. 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e})
  53. assert.NoError(t, err)
  54. stringMustStartWith(t, "Hola,", res)
  55. stringMustEndWith(t, "AAA.", res)
  56. res, err = ToUTF8WithErr([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
  57. 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e})
  58. assert.NoError(t, err)
  59. stringMustStartWith(t, "Hola,", res)
  60. stringMustEndWith(t, "AAA.", res)
  61. res, err = ToUTF8WithErr([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
  62. 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e})
  63. assert.NoError(t, err)
  64. stringMustStartWith(t, "Hola,", res)
  65. stringMustEndWith(t, "AAA.", res)
  66. // Japanese (Shift-JIS)
  67. // 日属秘ぞしちゅ。
  68. res, err = ToUTF8WithErr([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82,
  69. 0xBF, 0x82, 0xE3, 0x81, 0x42})
  70. assert.NoError(t, err)
  71. assert.Equal(t, []byte{0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
  72. 0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82},
  73. []byte(res))
  74. res, err = ToUTF8WithErr([]byte{0x00, 0x00, 0x00, 0x00})
  75. assert.NoError(t, err)
  76. assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, []byte(res))
  77. }
  78. func TestToUTF8WithFallback(t *testing.T) {
  79. resetDefaultCharsetsOrder()
  80. // "ABC"
  81. res := ToUTF8WithFallback([]byte{0x41, 0x42, 0x43})
  82. assert.Equal(t, []byte{0x41, 0x42, 0x43}, res)
  83. // "áéíóú"
  84. res = ToUTF8WithFallback([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  85. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  86. // UTF8 BOM + "áéíóú"
  87. res = ToUTF8WithFallback([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  88. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  89. // "Hola, así cómo ños"
  90. res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
  91. 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73})
  92. assert.Equal(t, []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63,
  93. 0xC3, 0xB3, 0x6D, 0x6F, 0x20, 0xC3, 0xB1, 0x6F, 0x73}, res)
  94. // "Hola, así cómo "
  95. minmatch := []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20}
  96. res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73})
  97. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  98. assert.Equal(t, minmatch, res[0:len(minmatch)])
  99. res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73})
  100. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  101. assert.Equal(t, minmatch, res[0:len(minmatch)])
  102. // Japanese (Shift-JIS)
  103. // "日属秘ぞしちゅ。"
  104. res = ToUTF8WithFallback([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42})
  105. assert.Equal(t, []byte{0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
  106. 0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82}, res)
  107. res = ToUTF8WithFallback([]byte{0x00, 0x00, 0x00, 0x00})
  108. assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res)
  109. }
  110. func TestToUTF8(t *testing.T) {
  111. resetDefaultCharsetsOrder()
  112. // Note: golang compiler seems so behave differently depending on the current
  113. // locale, so some conversions might behave differently. For that reason, we don't
  114. // depend on particular conversions but in expected behaviors.
  115. res := ToUTF8(string([]byte{0x41, 0x42, 0x43}))
  116. assert.Equal(t, "ABC", res)
  117. // "áéíóú"
  118. res = ToUTF8(string([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}))
  119. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res))
  120. // BOM + "áéíóú"
  121. res = ToUTF8(string([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3,
  122. 0xc3, 0xba}))
  123. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res))
  124. // Latin1
  125. // Hola, así cómo ños
  126. res = ToUTF8(string([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
  127. 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73}))
  128. assert.Equal(t, []byte{0x48, 0x6f, 0x6c, 0x61, 0x2c, 0x20, 0x61, 0x73, 0xc3, 0xad, 0x20, 0x63,
  129. 0xc3, 0xb3, 0x6d, 0x6f, 0x20, 0xc3, 0xb1, 0x6f, 0x73}, []byte(res))
  130. // Latin1
  131. // Hola, así cómo \x07ños
  132. res = ToUTF8(string([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
  133. 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73}))
  134. // Hola,
  135. bytesMustStartWith(t, []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C}, []byte(res))
  136. // This test FAILS
  137. // res = ToUTF8("Hola, así cómo \x81ños")
  138. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  139. // assert.Regexp(t, "^Hola, así cómo", res)
  140. // Japanese (Shift-JIS)
  141. // 日属秘ぞしちゅ。
  142. res = ToUTF8(string([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82,
  143. 0xBF, 0x82, 0xE3, 0x81, 0x42}))
  144. assert.Equal(t, []byte{0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
  145. 0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82},
  146. []byte(res))
  147. res = ToUTF8("\x00\x00\x00\x00")
  148. assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, []byte(res))
  149. }
  150. func TestToUTF8DropErrors(t *testing.T) {
  151. resetDefaultCharsetsOrder()
  152. // "ABC"
  153. res := ToUTF8DropErrors([]byte{0x41, 0x42, 0x43})
  154. assert.Equal(t, []byte{0x41, 0x42, 0x43}, res)
  155. // "áéíóú"
  156. res = ToUTF8DropErrors([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  157. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  158. // UTF8 BOM + "áéíóú"
  159. res = ToUTF8DropErrors([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
  160. assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
  161. // "Hola, así cómo ños"
  162. res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73})
  163. assert.Equal(t, []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73}, res[:8])
  164. assert.Equal(t, []byte{0x73}, res[len(res)-1:])
  165. // "Hola, así cómo "
  166. minmatch := []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20}
  167. res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73})
  168. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  169. assert.Equal(t, minmatch, res[0:len(minmatch)])
  170. res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73})
  171. // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
  172. assert.Equal(t, minmatch, res[0:len(minmatch)])
  173. // Japanese (Shift-JIS)
  174. // "日属秘ぞしちゅ。"
  175. res = ToUTF8DropErrors([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42})
  176. assert.Equal(t, []byte{0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
  177. 0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82}, res)
  178. res = ToUTF8DropErrors([]byte{0x00, 0x00, 0x00, 0x00})
  179. assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res)
  180. }
  181. func TestDetectEncoding(t *testing.T) {
  182. resetDefaultCharsetsOrder()
  183. testSuccess := func(b []byte, expected string) {
  184. encoding, err := DetectEncoding(b)
  185. assert.NoError(t, err)
  186. assert.Equal(t, expected, encoding)
  187. }
  188. // utf-8
  189. b := []byte("just some ascii")
  190. testSuccess(b, "UTF-8")
  191. // utf-8-sig: "hey" (with BOM)
  192. b = []byte{0xef, 0xbb, 0xbf, 0x68, 0x65, 0x79}
  193. testSuccess(b, "UTF-8")
  194. // utf-16: "hey<accented G>"
  195. b = []byte{0xff, 0xfe, 0x68, 0x00, 0x65, 0x00, 0x79, 0x00, 0xf4, 0x01}
  196. testSuccess(b, "UTF-16LE")
  197. // iso-8859-1: d<accented e>cor<newline>
  198. b = []byte{0x44, 0xe9, 0x63, 0x6f, 0x72, 0x0a}
  199. encoding, err := DetectEncoding(b)
  200. assert.NoError(t, err)
  201. assert.Contains(t, encoding, "ISO-8859-1")
  202. old := setting.Repository.AnsiCharset
  203. setting.Repository.AnsiCharset = "placeholder"
  204. defer func() {
  205. setting.Repository.AnsiCharset = old
  206. }()
  207. testSuccess(b, "placeholder")
  208. // invalid bytes
  209. b = []byte{0xfa}
  210. _, err = DetectEncoding(b)
  211. assert.Error(t, err)
  212. }
  213. func stringMustStartWith(t *testing.T, expected string, value string) {
  214. assert.Equal(t, expected, string(value[:len(expected)]))
  215. }
  216. func stringMustEndWith(t *testing.T, expected string, value string) {
  217. assert.Equal(t, expected, string(value[len(value)-len(expected):]))
  218. }
  219. func bytesMustStartWith(t *testing.T, expected []byte, value []byte) {
  220. assert.Equal(t, expected, value[:len(expected)])
  221. }