diff options
author | zeripath <art27@cantab.net> | 2022-01-07 01:18:52 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-01-07 02:18:52 +0100 |
commit | 21ed4fd8da4c8992518dcfb01aa7306f7406f735 (patch) | |
tree | eb0bdaed8d06849116818f058b6120633d329d69 /modules/charset/escape.go | |
parent | ee60f27aec0f75a34ae62841ed52579c0c20dcfa (diff) | |
download | gitea-21ed4fd8da4c8992518dcfb01aa7306f7406f735.tar.gz gitea-21ed4fd8da4c8992518dcfb01aa7306f7406f735.zip |
Add warning for BIDI characters in page renders and in diffs (#17562)
Fix #17514
Given the comments I've adjusted this somewhat. The number of characters detected has increased and now includes things like the use of U+0300 (combining grave accent) to make à (the letter a followed by U+0300) instead of the precomposed à (U+00E0), as well as non-breaking spaces.
There is a button which can be used to escape the content to show it.
Signed-off-by: Andrew Thornton <art27@cantab.net>
Co-authored-by: Gwyneth Morgan <gwymor@tilde.club>
Co-authored-by: silverwind <me@silverwind.io>
Co-authored-by: wxiaoguang <wxiaoguang@gmail.com>
Diffstat (limited to 'modules/charset/escape.go')
-rw-r--r-- | modules/charset/escape.go | 230 |
1 files changed, 230 insertions, 0 deletions
diff --git a/modules/charset/escape.go b/modules/charset/escape.go new file mode 100644 index 0000000000..abe813b465 --- /dev/null +++ b/modules/charset/escape.go @@ -0,0 +1,230 @@ +// Copyright 2021 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package charset + +import ( + "bytes" + "fmt" + "io" + "strings" + "unicode" + "unicode/utf8" + + "golang.org/x/text/unicode/bidi" +) + +// EscapeStatus represents the findings of the unicode escaper +type EscapeStatus struct { + Escaped bool + HasError bool + HasBadRunes bool + HasControls bool + HasSpaces bool + HasMarks bool + HasBIDI bool + BadBIDI bool + HasRTLScript bool + HasLTRScript bool +} + +// Or combines two EscapeStatus structs into one representing the conjunction of the two +func (status EscapeStatus) Or(other EscapeStatus) EscapeStatus { + st := status + st.Escaped = st.Escaped || other.Escaped + st.HasError = st.HasError || other.HasError + st.HasBadRunes = st.HasBadRunes || other.HasBadRunes + st.HasControls = st.HasControls || other.HasControls + st.HasSpaces = st.HasSpaces || other.HasSpaces + st.HasMarks = st.HasMarks || other.HasMarks + st.HasBIDI = st.HasBIDI || other.HasBIDI + st.BadBIDI = st.BadBIDI || other.BadBIDI + st.HasRTLScript = st.HasRTLScript || other.HasRTLScript + st.HasLTRScript = st.HasLTRScript || other.HasLTRScript + return st +} + +// EscapeControlString escapes the unicode control sequences in a provided string and returns the findings as an EscapeStatus and the escaped string +func EscapeControlString(text string) (EscapeStatus, string) { + sb := &strings.Builder{} + escaped, _ := EscapeControlReader(strings.NewReader(text), sb) + return escaped, sb.String() +} + +// EscapeControlBytes escapes the unicode control sequences a provided []byte and returns the findings as an EscapeStatus and the escaped []byte +func EscapeControlBytes(text []byte) (EscapeStatus, []byte) { + buf := 
&bytes.Buffer{} + escaped, _ := EscapeControlReader(bytes.NewReader(text), buf) + return escaped, buf.Bytes() +} + +// EscapeControlReader escapes the unicode control sequences a provided Reader writing the escaped output to the output and returns the findings as an EscapeStatus and an error +func EscapeControlReader(text io.Reader, output io.Writer) (escaped EscapeStatus, err error) { + buf := make([]byte, 4096) + readStart := 0 + var n int + var writePos int + + lineHasBIDI := false + lineHasRTLScript := false + lineHasLTRScript := false + +readingloop: + for err == nil { + n, err = text.Read(buf[readStart:]) + bs := buf[:n+readStart] + i := 0 + + for i < len(bs) { + r, size := utf8.DecodeRune(bs[i:]) + // Now handle the codepoints + switch { + case r == utf8.RuneError: + if writePos < i { + if _, err = output.Write(bs[writePos:i]); err != nil { + escaped.HasError = true + return + } + writePos = i + } + // runes can be at most 4 bytes - so... + if len(bs)-i <= 3 { + // if not request more data + copy(buf, bs[i:]) + readStart = n - i + writePos = 0 + continue readingloop + } + // this is a real broken rune + escaped.HasBadRunes = true + escaped.Escaped = true + if err = writeBroken(output, bs[i:i+size]); err != nil { + escaped.HasError = true + return + } + writePos += size + case r == '\n': + if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript { + escaped.BadBIDI = true + } + lineHasBIDI = false + lineHasRTLScript = false + lineHasLTRScript = false + + case r == '\r' || r == '\t' || r == ' ': + // These are acceptable control characters and space characters + case unicode.IsSpace(r): + escaped.HasSpaces = true + escaped.Escaped = true + if writePos < i { + if _, err = output.Write(bs[writePos:i]); err != nil { + escaped.HasError = true + return + } + } + if err = writeEscaped(output, r); err != nil { + escaped.HasError = true + return + } + writePos = i + size + case unicode.Is(unicode.Bidi_Control, r): + escaped.Escaped = true + escaped.HasBIDI = true + if 
writePos < i { + if _, err = output.Write(bs[writePos:i]); err != nil { + escaped.HasError = true + return + } + } + lineHasBIDI = true + if err = writeEscaped(output, r); err != nil { + escaped.HasError = true + return + } + writePos = i + size + case unicode.Is(unicode.C, r): + escaped.Escaped = true + escaped.HasControls = true + if writePos < i { + if _, err = output.Write(bs[writePos:i]); err != nil { + escaped.HasError = true + return + } + } + if err = writeEscaped(output, r); err != nil { + escaped.HasError = true + return + } + writePos = i + size + case unicode.Is(unicode.M, r): + escaped.Escaped = true + escaped.HasMarks = true + if writePos < i { + if _, err = output.Write(bs[writePos:i]); err != nil { + escaped.HasError = true + return + } + } + if err = writeEscaped(output, r); err != nil { + escaped.HasError = true + return + } + writePos = i + size + default: + p, _ := bidi.Lookup(bs[i : i+size]) + c := p.Class() + if c == bidi.R || c == bidi.AL { + lineHasRTLScript = true + escaped.HasRTLScript = true + } else if c == bidi.L { + lineHasLTRScript = true + escaped.HasLTRScript = true + } + } + i += size + } + if n > 0 { + // we read something... 
+ // write everything unwritten + if writePos < i { + if _, err = output.Write(bs[writePos:i]); err != nil { + escaped.HasError = true + return + } + } + + // reset the starting positions for the next read + readStart = 0 + writePos = 0 + } + } + if readStart > 0 { + // this means that there is an incomplete or broken rune at 0-readStart and we read nothing on the last go round + escaped.Escaped = true + escaped.HasBadRunes = true + if err = writeBroken(output, buf[:readStart]); err != nil { + escaped.HasError = true + return + } + } + if err == io.EOF { + if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript { + escaped.BadBIDI = true + } + err = nil + return + } + escaped.HasError = true + return +} + +func writeBroken(output io.Writer, bs []byte) (err error) { + _, err = fmt.Fprintf(output, `<span class="broken-code-point"><%X></span>`, bs) + return +} + +func writeEscaped(output io.Writer, r rune) (err error) { + _, err = fmt.Fprintf(output, `<span class="escaped-code-point" data-escaped="[U+%04X]"><span class="char">%c</span></span>`, r, r) + return +} |