Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

escape_stream.go 7.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298
  1. // Copyright 2022 The Gitea Authors. All rights reserved.
  2. // Use of this source code is governed by a MIT-style
  3. // license that can be found in the LICENSE file.
  4. package charset
  5. import (
  6. "fmt"
  7. "regexp"
  8. "sort"
  9. "strings"
  10. "unicode"
  11. "unicode/utf8"
  12. "code.gitea.io/gitea/modules/translation"
  13. "golang.org/x/net/html"
  14. )
  15. // VScode defaultWordRegexp
  16. var defaultWordRegexp = regexp.MustCompile(`(-?\d*\.\d\w*)|([^\` + "`" + `\~\!\@\#\$\%\^\&\*\(\)\-\=\+\[\{\]\}\\\|\;\:\'\"\,\.\<\>\/\?\s\x00-\x1f]+)`)
  17. func NewEscapeStreamer(locale translation.Locale, next HTMLStreamer, allowed ...rune) HTMLStreamer {
  18. return &escapeStreamer{
  19. escaped: &EscapeStatus{},
  20. PassthroughHTMLStreamer: *NewPassthroughStreamer(next),
  21. locale: locale,
  22. ambiguousTables: AmbiguousTablesForLocale(locale),
  23. allowed: allowed,
  24. }
  25. }
  26. type escapeStreamer struct {
  27. PassthroughHTMLStreamer
  28. escaped *EscapeStatus
  29. locale translation.Locale
  30. ambiguousTables []*AmbiguousTable
  31. allowed []rune
  32. }
  33. func (e *escapeStreamer) EscapeStatus() *EscapeStatus {
  34. return e.escaped
  35. }
  36. // Text tells the next streamer there is a text
  37. func (e *escapeStreamer) Text(data string) error {
  38. sb := &strings.Builder{}
  39. pos, until, next := 0, 0, 0
  40. if len(data) > len(UTF8BOM) && data[:len(UTF8BOM)] == string(UTF8BOM) {
  41. _, _ = sb.WriteString(data[:len(UTF8BOM)])
  42. pos = len(UTF8BOM)
  43. }
  44. dataBytes := []byte(data)
  45. for pos < len(data) {
  46. nextIdxs := defaultWordRegexp.FindStringIndex(data[pos:])
  47. if nextIdxs == nil {
  48. until = len(data)
  49. next = until
  50. } else {
  51. until, next = nextIdxs[0]+pos, nextIdxs[1]+pos
  52. }
  53. // from pos until until we know that the runes are not \r\t\n or even ' '
  54. runes := make([]rune, 0, next-until)
  55. positions := make([]int, 0, next-until+1)
  56. for pos < until {
  57. r, sz := utf8.DecodeRune(dataBytes[pos:])
  58. positions = positions[:0]
  59. positions = append(positions, pos, pos+sz)
  60. types, confusables, _ := e.runeTypes(r)
  61. if err := e.handleRunes(dataBytes, []rune{r}, positions, types, confusables, sb); err != nil {
  62. return err
  63. }
  64. pos += sz
  65. }
  66. for i := pos; i < next; {
  67. r, sz := utf8.DecodeRune(dataBytes[i:])
  68. runes = append(runes, r)
  69. positions = append(positions, i)
  70. i += sz
  71. }
  72. positions = append(positions, next)
  73. types, confusables, runeCounts := e.runeTypes(runes...)
  74. if runeCounts.needsEscape() {
  75. if err := e.handleRunes(dataBytes, runes, positions, types, confusables, sb); err != nil {
  76. return err
  77. }
  78. } else {
  79. _, _ = sb.Write(dataBytes[pos:next])
  80. }
  81. pos = next
  82. }
  83. if sb.Len() > 0 {
  84. if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil {
  85. return err
  86. }
  87. }
  88. return nil
  89. }
  90. func (e *escapeStreamer) handleRunes(data []byte, runes []rune, positions []int, types []runeType, confusables []rune, sb *strings.Builder) error {
  91. for i, r := range runes {
  92. switch types[i] {
  93. case brokenRuneType:
  94. if sb.Len() > 0 {
  95. if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil {
  96. return err
  97. }
  98. sb.Reset()
  99. }
  100. end := positions[i+1]
  101. start := positions[i]
  102. if err := e.brokenRune(data[start:end]); err != nil {
  103. return err
  104. }
  105. case ambiguousRuneType:
  106. if sb.Len() > 0 {
  107. if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil {
  108. return err
  109. }
  110. sb.Reset()
  111. }
  112. if err := e.ambiguousRune(r, confusables[0]); err != nil {
  113. return err
  114. }
  115. confusables = confusables[1:]
  116. case invisibleRuneType:
  117. if sb.Len() > 0 {
  118. if err := e.PassthroughHTMLStreamer.Text(sb.String()); err != nil {
  119. return err
  120. }
  121. sb.Reset()
  122. }
  123. if err := e.invisibleRune(r); err != nil {
  124. return err
  125. }
  126. default:
  127. _, _ = sb.WriteRune(r)
  128. }
  129. }
  130. return nil
  131. }
  132. func (e *escapeStreamer) brokenRune(bs []byte) error {
  133. e.escaped.Escaped = true
  134. e.escaped.HasBadRunes = true
  135. if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
  136. Key: "class",
  137. Val: "broken-code-point",
  138. }); err != nil {
  139. return err
  140. }
  141. if err := e.PassthroughHTMLStreamer.Text(fmt.Sprintf("<%X>", bs)); err != nil {
  142. return err
  143. }
  144. return e.PassthroughHTMLStreamer.EndTag("span")
  145. }
  146. func (e *escapeStreamer) ambiguousRune(r, c rune) error {
  147. e.escaped.Escaped = true
  148. e.escaped.HasAmbiguous = true
  149. if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
  150. Key: "class",
  151. Val: "ambiguous-code-point tooltip",
  152. }, html.Attribute{
  153. Key: "data-content",
  154. Val: e.locale.Tr("repo.ambiguous_character", r, c),
  155. }); err != nil {
  156. return err
  157. }
  158. if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
  159. Key: "class",
  160. Val: "char",
  161. }); err != nil {
  162. return err
  163. }
  164. if err := e.PassthroughHTMLStreamer.Text(string(r)); err != nil {
  165. return err
  166. }
  167. if err := e.PassthroughHTMLStreamer.EndTag("span"); err != nil {
  168. return err
  169. }
  170. return e.PassthroughHTMLStreamer.EndTag("span")
  171. }
  172. func (e *escapeStreamer) invisibleRune(r rune) error {
  173. e.escaped.Escaped = true
  174. e.escaped.HasInvisible = true
  175. if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
  176. Key: "class",
  177. Val: "escaped-code-point",
  178. }, html.Attribute{
  179. Key: "data-escaped",
  180. Val: fmt.Sprintf("[U+%04X]", r),
  181. }); err != nil {
  182. return err
  183. }
  184. if err := e.PassthroughHTMLStreamer.StartTag("span", html.Attribute{
  185. Key: "class",
  186. Val: "char",
  187. }); err != nil {
  188. return err
  189. }
  190. if err := e.PassthroughHTMLStreamer.Text(string(r)); err != nil {
  191. return err
  192. }
  193. if err := e.PassthroughHTMLStreamer.EndTag("span"); err != nil {
  194. return err
  195. }
  196. return e.PassthroughHTMLStreamer.EndTag("span")
  197. }
  198. type runeCountType struct {
  199. numBasicRunes int
  200. numNonConfusingNonBasicRunes int
  201. numAmbiguousRunes int
  202. numInvisibleRunes int
  203. numBrokenRunes int
  204. }
  205. func (counts runeCountType) needsEscape() bool {
  206. if counts.numBrokenRunes > 0 {
  207. return true
  208. }
  209. if counts.numBasicRunes == 0 &&
  210. counts.numNonConfusingNonBasicRunes > 0 {
  211. return false
  212. }
  213. return counts.numAmbiguousRunes > 0 || counts.numInvisibleRunes > 0
  214. }
  215. type runeType int
  216. const (
  217. basicASCIIRuneType runeType = iota //nolint // <- This is technically deadcode but its self-documenting so it should stay
  218. brokenRuneType
  219. nonBasicASCIIRuneType
  220. ambiguousRuneType
  221. invisibleRuneType
  222. )
  223. func (e *escapeStreamer) runeTypes(runes ...rune) (types []runeType, confusables []rune, runeCounts runeCountType) {
  224. types = make([]runeType, len(runes))
  225. for i, r := range runes {
  226. var confusable rune
  227. switch {
  228. case r == utf8.RuneError:
  229. types[i] = brokenRuneType
  230. runeCounts.numBrokenRunes++
  231. case r == ' ' || r == '\t' || r == '\n':
  232. runeCounts.numBasicRunes++
  233. case e.isAllowed(r):
  234. if r > 0x7e || r < 0x20 {
  235. types[i] = nonBasicASCIIRuneType
  236. runeCounts.numNonConfusingNonBasicRunes++
  237. } else {
  238. runeCounts.numBasicRunes++
  239. }
  240. case unicode.Is(InvisibleRanges, r):
  241. types[i] = invisibleRuneType
  242. runeCounts.numInvisibleRunes++
  243. case unicode.IsControl(r):
  244. types[i] = invisibleRuneType
  245. runeCounts.numInvisibleRunes++
  246. case isAmbiguous(r, &confusable, e.ambiguousTables...):
  247. confusables = append(confusables, confusable)
  248. types[i] = ambiguousRuneType
  249. runeCounts.numAmbiguousRunes++
  250. case r > 0x7e || r < 0x20:
  251. types[i] = nonBasicASCIIRuneType
  252. runeCounts.numNonConfusingNonBasicRunes++
  253. default:
  254. runeCounts.numBasicRunes++
  255. }
  256. }
  257. return types, confusables, runeCounts
  258. }
  259. func (e *escapeStreamer) isAllowed(r rune) bool {
  260. if len(e.allowed) == 0 {
  261. return false
  262. }
  263. if len(e.allowed) == 1 {
  264. return e.allowed[0] == r
  265. }
  266. return sort.Search(len(e.allowed), func(i int) bool {
  267. return e.allowed[i] >= r
  268. }) >= 0
  269. }