diff options
Diffstat (limited to 'modules/charset/escape.go')
-rw-r--r-- | modules/charset/escape.go | 250 |
1 files changed, 36 insertions, 214 deletions
diff --git a/modules/charset/escape.go b/modules/charset/escape.go index 9c1baafba3..b264a569ff 100644 --- a/modules/charset/escape.go +++ b/modules/charset/escape.go @@ -1,236 +1,58 @@ -// Copyright 2021 The Gitea Authors. All rights reserved. +// Copyright 2022 The Gitea Authors. All rights reserved. // Use of this source code is governed by a MIT-style // license that can be found in the LICENSE file. +//go:generate go run invisible/generate.go -v -o ./invisible_gen.go + +//go:generate go run ambiguous/generate.go -v -o ./ambiguous_gen.go ambiguous/ambiguous.json + package charset import ( - "bytes" - "fmt" "io" "strings" - "unicode" - "unicode/utf8" - "golang.org/x/text/unicode/bidi" + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/translation" ) -// EscapeStatus represents the findings of the unicode escaper -type EscapeStatus struct { - Escaped bool - HasError bool - HasBadRunes bool - HasControls bool - HasSpaces bool - HasMarks bool - HasBIDI bool - BadBIDI bool - HasRTLScript bool - HasLTRScript bool -} - -// Or combines two EscapeStatus structs into one representing the conjunction of the two -func (status EscapeStatus) Or(other EscapeStatus) EscapeStatus { - st := status - st.Escaped = st.Escaped || other.Escaped - st.HasError = st.HasError || other.HasError - st.HasBadRunes = st.HasBadRunes || other.HasBadRunes - st.HasControls = st.HasControls || other.HasControls - st.HasSpaces = st.HasSpaces || other.HasSpaces - st.HasMarks = st.HasMarks || other.HasMarks - st.HasBIDI = st.HasBIDI || other.HasBIDI - st.BadBIDI = st.BadBIDI || other.BadBIDI - st.HasRTLScript = st.HasRTLScript || other.HasRTLScript - st.HasLTRScript = st.HasLTRScript || other.HasLTRScript - return st -} +// RuneNBSP is the codepoint for NBSP +const RuneNBSP = 0xa0 -// EscapeControlString escapes the unicode control sequences in a provided string and returns the findings as an EscapeStatus and the escaped string -func EscapeControlString(text string) (EscapeStatus, 
string) { +// EscapeControlHTML escapes the unicode control sequences in a provided html document +func EscapeControlHTML(text string, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, output string) { sb := &strings.Builder{} - escaped, _ := EscapeControlReader(strings.NewReader(text), sb) - return escaped, sb.String() -} + outputStream := &HTMLStreamerWriter{Writer: sb} + streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer) -// EscapeControlBytes escapes the unicode control sequences a provided []byte and returns the findings as an EscapeStatus and the escaped []byte -func EscapeControlBytes(text []byte) (EscapeStatus, []byte) { - buf := &bytes.Buffer{} - escaped, _ := EscapeControlReader(bytes.NewReader(text), buf) - return escaped, buf.Bytes() + if err := StreamHTML(strings.NewReader(text), streamer); err != nil { + streamer.escaped.HasError = true + log.Error("Error whilst escaping: %v", err) + } + return streamer.escaped, sb.String() } -// EscapeControlReader escapes the unicode control sequences a provided Reader writing the escaped output to the output and returns the findings as an EscapeStatus and an error -func EscapeControlReader(text io.Reader, output io.Writer) (escaped EscapeStatus, err error) { - buf := make([]byte, 4096) - readStart := 0 - runeCount := 0 - var n int - var writePos int - - lineHasBIDI := false - lineHasRTLScript := false - lineHasLTRScript := false - -readingloop: - for err == nil { - n, err = text.Read(buf[readStart:]) - bs := buf[:n+readStart] - n = len(bs) - i := 0 +// EscapeControlReader escapes the unicode control sequences in a provided reader, writing the output to the provided writer in a locale, and returns the findings as an EscapeStatus and an error +func EscapeControlReader(reader io.Reader, writer io.Writer, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, err error) { + outputStream := &HTMLStreamerWriter{Writer: writer} + streamer := NewEscapeStreamer(locale, outputStream, 
allowed...).(*escapeStreamer) - for i < len(bs) { - r, size := utf8.DecodeRune(bs[i:]) - runeCount++ - - // Now handle the codepoints - switch { - case r == utf8.RuneError: - if writePos < i { - if _, err = output.Write(bs[writePos:i]); err != nil { - escaped.HasError = true - return - } - writePos = i - } - // runes can be at most 4 bytes - so... - if len(bs)-i <= 3 { - // if not request more data - copy(buf, bs[i:]) - readStart = n - i - writePos = 0 - continue readingloop - } - // this is a real broken rune - escaped.HasBadRunes = true - escaped.Escaped = true - if err = writeBroken(output, bs[i:i+size]); err != nil { - escaped.HasError = true - return - } - writePos += size - case r == '\n': - if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript { - escaped.BadBIDI = true - } - lineHasBIDI = false - lineHasRTLScript = false - lineHasLTRScript = false - - case runeCount == 1 && r == 0xFEFF: // UTF BOM - // the first BOM is safe - case r == '\r' || r == '\t' || r == ' ': - // These are acceptable control characters and space characters - case unicode.IsSpace(r): - escaped.HasSpaces = true - escaped.Escaped = true - if writePos < i { - if _, err = output.Write(bs[writePos:i]); err != nil { - escaped.HasError = true - return - } - } - if err = writeEscaped(output, r); err != nil { - escaped.HasError = true - return - } - writePos = i + size - case unicode.Is(unicode.Bidi_Control, r): - escaped.Escaped = true - escaped.HasBIDI = true - if writePos < i { - if _, err = output.Write(bs[writePos:i]); err != nil { - escaped.HasError = true - return - } - } - lineHasBIDI = true - if err = writeEscaped(output, r); err != nil { - escaped.HasError = true - return - } - writePos = i + size - case unicode.Is(unicode.C, r): - escaped.Escaped = true - escaped.HasControls = true - if writePos < i { - if _, err = output.Write(bs[writePos:i]); err != nil { - escaped.HasError = true - return - } - } - if err = writeEscaped(output, r); err != nil { - escaped.HasError = true - 
return - } - writePos = i + size - case unicode.Is(unicode.M, r): - escaped.Escaped = true - escaped.HasMarks = true - if writePos < i { - if _, err = output.Write(bs[writePos:i]); err != nil { - escaped.HasError = true - return - } - } - if err = writeEscaped(output, r); err != nil { - escaped.HasError = true - return - } - writePos = i + size - default: - p, _ := bidi.Lookup(bs[i : i+size]) - c := p.Class() - if c == bidi.R || c == bidi.AL { - lineHasRTLScript = true - escaped.HasRTLScript = true - } else if c == bidi.L { - lineHasLTRScript = true - escaped.HasLTRScript = true - } - } - i += size - } - if n > 0 { - // we read something... - // write everything unwritten - if writePos < i { - if _, err = output.Write(bs[writePos:i]); err != nil { - escaped.HasError = true - return - } - } - - // reset the starting positions for the next read - readStart = 0 - writePos = 0 - } - } - if readStart > 0 { - // this means that there is an incomplete or broken rune at 0-readStart and we read nothing on the last go round - escaped.Escaped = true - escaped.HasBadRunes = true - if err = writeBroken(output, buf[:readStart]); err != nil { - escaped.HasError = true - return - } - } - if err == io.EOF { - if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript { - escaped.BadBIDI = true - } - err = nil - return + if err = StreamHTML(reader, streamer); err != nil { + streamer.escaped.HasError = true + log.Error("Error whilst escaping: %v", err) } - escaped.HasError = true - return escaped, err + return streamer.escaped, err } -func writeBroken(output io.Writer, bs []byte) (err error) { - _, err = fmt.Fprintf(output, `<span class="broken-code-point"><%X></span>`, bs) - return err -} +// EscapeControlString escapes the unicode control sequences in a provided string and returns the findings as an EscapeStatus and the escaped string +func EscapeControlString(text string, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, output string) { + sb := &strings.Builder{} + 
outputStream := &HTMLStreamerWriter{Writer: sb} + streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer) -func writeEscaped(output io.Writer, r rune) (err error) { - _, err = fmt.Fprintf(output, `<span class="escaped-code-point" data-escaped="[U+%04X]"><span class="char">%c</span></span>`, r, r) - return err + if err := streamer.Text(text); err != nil { + streamer.escaped.HasError = true + log.Error("Error whilst escaping: %v", err) + } + return streamer.escaped, sb.String() } |