summary | refs | log | tree | commit | diff | stats
path: root/modules/charset/escape.go
diff options
context:
space:
mode:
author zeripath <art27@cantab.net> 2022-01-07 01:18:52 +0000
committer GitHub <noreply@github.com> 2022-01-07 02:18:52 +0100
commit 21ed4fd8da4c8992518dcfb01aa7306f7406f735 (patch)
tree eb0bdaed8d06849116818f058b6120633d329d69 /modules/charset/escape.go
parent ee60f27aec0f75a34ae62841ed52579c0c20dcfa (diff)
download gitea-21ed4fd8da4c8992518dcfb01aa7306f7406f735.tar.gz
gitea-21ed4fd8da4c8992518dcfb01aa7306f7406f735.zip
Add warning for BIDI characters in page renders and in diffs (#17562)
Fix #17514 Given the comments I've adjusted this somewhat. The numbers of characters detected are increased and include things like the use of U+300 to make à instead of à and non-breaking spaces. There is a button which can be used to escape the content to show it. Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: Gwyneth Morgan <gwymor@tilde.club> Co-authored-by: silverwind <me@silverwind.io> Co-authored-by: wxiaoguang <wxiaoguang@gmail.com>
Diffstat (limited to 'modules/charset/escape.go')
-rw-r--r-- modules/charset/escape.go 230
1 files changed, 230 insertions, 0 deletions
diff --git a/modules/charset/escape.go b/modules/charset/escape.go
new file mode 100644
index 0000000000..abe813b465
--- /dev/null
+++ b/modules/charset/escape.go
@@ -0,0 +1,230 @@
+// Copyright 2021 The Gitea Authors. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+package charset
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+ "strings"
+ "unicode"
+ "unicode/utf8"
+
+ "golang.org/x/text/unicode/bidi"
+)
+
+// EscapeStatus represents the findings of the unicode escaper
+type EscapeStatus struct {
+	// Escaped is set when at least one rune or byte sequence was written in escaped form
+	Escaped bool
+	// HasError is set when the escaper reports a failure while reading its input or writing its output
+	HasError bool
+	// HasBadRunes is set when bytes that do not decode as a UTF-8 rune were found
+	HasBadRunes bool
+	// HasControls is set when a character in unicode.C was found that is neither accepted whitespace nor a BIDI control
+	HasControls bool
+	// HasSpaces is set when a whitespace character other than '\n', '\r', '\t' and ' ' was found
+	HasSpaces bool
+	// HasMarks is set when a character in unicode.M was found
+	HasMarks bool
+	// HasBIDI is set when a character in unicode.Bidi_Control was found
+	HasBIDI bool
+	// BadBIDI is set when a line contains a BIDI control character and left-to-right text but no right-to-left text
+	BadBIDI bool
+	// HasRTLScript is set when a character with bidi class R or AL was found
+	HasRTLScript bool
+	// HasLTRScript is set when a character with bidi class L was found
+	HasLTRScript bool
+}
+
+// Or combines two EscapeStatus structs into one in which every finding is set if it is set in either of the two.
+// Neither the receiver nor other is modified.
+func (status EscapeStatus) Or(other EscapeStatus) EscapeStatus {
+	return EscapeStatus{
+		Escaped:      status.Escaped || other.Escaped,
+		HasError:     status.HasError || other.HasError,
+		HasBadRunes:  status.HasBadRunes || other.HasBadRunes,
+		HasControls:  status.HasControls || other.HasControls,
+		HasSpaces:    status.HasSpaces || other.HasSpaces,
+		HasMarks:     status.HasMarks || other.HasMarks,
+		HasBIDI:      status.HasBIDI || other.HasBIDI,
+		BadBIDI:      status.BadBIDI || other.BadBIDI,
+		HasRTLScript: status.HasRTLScript || other.HasRTLScript,
+		HasLTRScript: status.HasLTRScript || other.HasLTRScript,
+	}
+}
+
+// EscapeControlString escapes the unicode control sequences in a provided string and returns the findings as an
+// EscapeStatus together with the escaped string
+func EscapeControlString(text string) (EscapeStatus, string) {
+	var out strings.Builder
+	// the error result is deliberately discarded: both ends are in-memory, and any failure is still visible to the
+	// caller through the HasError field of the returned status
+	status, _ := EscapeControlReader(strings.NewReader(text), &out)
+	return status, out.String()
+}
+
+// EscapeControlBytes escapes the unicode control sequences in a provided []byte and returns the findings as an
+// EscapeStatus together with the escaped []byte
+func EscapeControlBytes(text []byte) (EscapeStatus, []byte) {
+	var out bytes.Buffer
+	// the error result is deliberately discarded: both ends are in-memory, and any failure is still visible to the
+	// caller through the HasError field of the returned status
+	status, _ := EscapeControlReader(bytes.NewReader(text), &out)
+	return status, out.Bytes()
+}
+
+// EscapeControlReader escapes the unicode control sequences in a provided Reader, writing the escaped output to the
+// output, and returns the findings as an EscapeStatus and an error.
+//
+// The input is consumed in chunks of up to 4096 bytes. Runes that need no escaping are copied to output unchanged.
+// Whitespace other than '\n', '\r', '\t' and ' ', BIDI control characters, other characters in unicode.C and
+// characters in unicode.M are written with writeEscaped. Bytes that do not decode as UTF-8 - and a literal U+FFFD,
+// which decodes to the same value - are written with writeBroken.
+//
+// A failed write to output is returned immediately with HasError set. A read error other than io.EOF is returned
+// with HasError set once the bytes delivered alongside it have been processed. io.EOF is not reported as an error.
+func EscapeControlReader(text io.Reader, output io.Writer) (escaped EscapeStatus, err error) {
+	buf := make([]byte, 4096)
+	// readStart is the number of bytes at the front of buf that were carried over from the previous read because
+	// they are the beginning of a rune that was cut off by the end of the chunk
+	readStart := 0
+
+	lineHasBIDI := false
+	lineHasRTLScript := false
+	lineHasLTRScript := false
+
+	// readErr is kept apart from err so that a later successful write cannot hide the reason the read loop ended
+	var readErr error
+	for readErr == nil {
+		var n int
+		n, readErr = text.Read(buf[readStart:])
+		bs := buf[:readStart+n]
+		writePos := 0 // bs[:writePos] has already been written to output
+		i := 0
+
+	runeloop:
+		for i < len(bs) {
+			r, size := utf8.DecodeRune(bs[i:])
+			needsEscape := false
+
+			// Now handle the codepoints
+			switch {
+			case r == utf8.RuneError:
+				if !utf8.FullRune(bs[i:]) {
+					// the chunk stops part way through a rune - leave it for the next read to complete
+					break runeloop
+				}
+				// this is a real broken rune
+				if writePos < i {
+					if _, err = output.Write(bs[writePos:i]); err != nil {
+						escaped.HasError = true
+						return escaped, err
+					}
+				}
+				escaped.HasBadRunes = true
+				escaped.Escaped = true
+				if err = writeBroken(output, bs[i:i+size]); err != nil {
+					escaped.HasError = true
+					return escaped, err
+				}
+				writePos = i + size
+			case r == '\n':
+				// BIDI controls on a line with only left-to-right text are suspicious
+				if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript {
+					escaped.BadBIDI = true
+				}
+				lineHasBIDI = false
+				lineHasRTLScript = false
+				lineHasLTRScript = false
+			case r == '\r' || r == '\t' || r == ' ':
+				// These are acceptable control characters and space characters
+			case unicode.IsSpace(r):
+				escaped.HasSpaces = true
+				needsEscape = true
+			case unicode.Is(unicode.Bidi_Control, r):
+				escaped.HasBIDI = true
+				lineHasBIDI = true
+				needsEscape = true
+			case unicode.Is(unicode.C, r):
+				escaped.HasControls = true
+				needsEscape = true
+			case unicode.Is(unicode.M, r):
+				escaped.HasMarks = true
+				needsEscape = true
+			default:
+				p, _ := bidi.Lookup(bs[i : i+size])
+				c := p.Class()
+				if c == bidi.R || c == bidi.AL {
+					lineHasRTLScript = true
+					escaped.HasRTLScript = true
+				} else if c == bidi.L {
+					lineHasLTRScript = true
+					escaped.HasLTRScript = true
+				}
+			}
+
+			if needsEscape {
+				escaped.Escaped = true
+				// write the pending unescaped bytes first so that the output stays in input order
+				if writePos < i {
+					if _, err = output.Write(bs[writePos:i]); err != nil {
+						escaped.HasError = true
+						return escaped, err
+					}
+				}
+				if err = writeEscaped(output, r); err != nil {
+					escaped.HasError = true
+					return escaped, err
+				}
+				writePos = i + size
+			}
+			i += size
+		}
+
+		// write everything that has been examined but not yet written
+		if writePos < i {
+			if _, err = output.Write(bs[writePos:i]); err != nil {
+				escaped.HasError = true
+				return escaped, err
+			}
+		}
+
+		// move an incomplete trailing rune (if any) to the front of buf so that the next read appends to it;
+		// copy reports how many bytes that was, which is 0 when the whole chunk was consumed
+		readStart = copy(buf, bs[i:])
+	}
+
+	if readStart > 0 {
+		// the input ended part way through a rune
+		escaped.Escaped = true
+		escaped.HasBadRunes = true
+		if err = writeBroken(output, buf[:readStart]); err != nil {
+			escaped.HasError = true
+			return escaped, err
+		}
+	}
+
+	if readErr != io.EOF {
+		escaped.HasError = true
+		return escaped, readErr
+	}
+
+	// the last line may not be terminated by a newline
+	if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript {
+		escaped.BadBIDI = true
+	}
+	return escaped, nil
+}
+
+// writeBroken writes bs to output as upper-case hexadecimal between HTML-escaped angle brackets, wrapped in a span
+// with the class "broken-code-point". It returns the error reported by the write, if any.
+func writeBroken(output io.Writer, bs []byte) error {
+	const brokenFormat = `<span class="broken-code-point">&lt;%X&gt;</span>`
+	if _, err := fmt.Fprintf(output, brokenFormat, bs); err != nil {
+		return err
+	}
+	return nil
+}
+
+// writeEscaped writes r to output wrapped in a span with the class "escaped-code-point" whose data-escaped
+// attribute holds the code point as [U+XXXX] (upper-case hexadecimal, at least four digits); the rune itself is
+// kept inside an inner span with the class "char". It returns the error reported by the write, if any.
+func writeEscaped(output io.Writer, r rune) error {
+	const escapedFormat = `<span class="escaped-code-point" data-escaped="[U+%04X]"><span class="char">%c</span></span>`
+	if _, err := fmt.Fprintf(output, escapedFormat, r, r); err != nil {
+		return err
+	}
+	return nil
+}