diff options
author | wxiaoguang <wxiaoguang@gmail.com> | 2023-12-17 22:38:54 +0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-12-17 14:38:54 +0000 |
commit | 20929edc9962281e35a81756d76dd1caa5741ff8 (patch) | |
tree | 138bbb9c97e609136fe83cf6e5524949218d1e72 /modules | |
parent | 408a4842240e7dd906e682196bd4254d6c76fcb9 (diff) | |
download | gitea-20929edc9962281e35a81756d76dd1caa5741ff8.tar.gz gitea-20929edc9962281e35a81756d76dd1caa5741ff8.zip |
Add option to disable ambiguous unicode characters detection (#28454)
* Close #24483
* Close #28123
* Close #23682
* Close #23149
(maybe more)
Diffstat (limited to 'modules')
-rw-r--r-- | modules/charset/escape.go | 59 | ||||
-rw-r--r-- | modules/charset/escape_stream.go | 2 | ||||
-rw-r--r-- | modules/charset/escape_test.go | 52 | ||||
-rw-r--r-- | modules/git/command.go | 11 | ||||
-rw-r--r-- | modules/highlight/highlight.go | 29 | ||||
-rw-r--r-- | modules/highlight/highlight_test.go | 32 | ||||
-rw-r--r-- | modules/indexer/code/search.go | 3 | ||||
-rw-r--r-- | modules/markup/orgmode/orgmode.go | 2 | ||||
-rw-r--r-- | modules/setting/ui.go | 5 | ||||
-rw-r--r-- | modules/util/string.go | 14 |
10 files changed, 85 insertions, 124 deletions
diff --git a/modules/charset/escape.go b/modules/charset/escape.go index 5608836a45..92e417d1f7 100644 --- a/modules/charset/escape.go +++ b/modules/charset/escape.go @@ -8,11 +8,12 @@ package charset import ( - "bufio" + "html/template" "io" "strings" "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/translation" ) @@ -20,20 +21,18 @@ import ( const RuneNBSP = 0xa0 // EscapeControlHTML escapes the unicode control sequences in a provided html document -func EscapeControlHTML(text string, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, output string) { +func EscapeControlHTML(html template.HTML, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, output template.HTML) { sb := &strings.Builder{} - outputStream := &HTMLStreamerWriter{Writer: sb} - streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer) - - if err := StreamHTML(strings.NewReader(text), streamer); err != nil { - streamer.escaped.HasError = true - log.Error("Error whilst escaping: %v", err) - } - return streamer.escaped, sb.String() + escaped, _ = EscapeControlReader(strings.NewReader(string(html)), sb, locale, allowed...) // err has been handled in EscapeControlReader + return escaped, template.HTML(sb.String()) } -// EscapeControlReaders escapes the unicode control sequences in a provided reader of HTML content and writer in a locale and returns the findings as an EscapeStatus and the escaped []byte +// EscapeControlReader escapes the unicode control sequences in a provided reader of HTML content and writer in a locale and returns the findings as an EscapeStatus func EscapeControlReader(reader io.Reader, writer io.Writer, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, err error) { + if !setting.UI.AmbiguousUnicodeDetection { + _, err = io.Copy(writer, reader) + return &EscapeStatus{}, err + } outputStream := &HTMLStreamerWriter{Writer: writer} streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer) @@ -43,41 +42,3 @@ func EscapeControlReader(reader io.Reader, writer io.Writer, locale translation. } return streamer.escaped, err } - -// EscapeControlStringReader escapes the unicode control sequences in a provided reader of string content and writer in a locale and returns the findings as an EscapeStatus and the escaped []byte. HTML line breaks are not inserted after every newline by this method. -func EscapeControlStringReader(reader io.Reader, writer io.Writer, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, err error) { - bufRd := bufio.NewReader(reader) - outputStream := &HTMLStreamerWriter{Writer: writer} - streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer) - - for { - line, rdErr := bufRd.ReadString('\n') - if len(line) > 0 { - if err := streamer.Text(line); err != nil { - streamer.escaped.HasError = true - log.Error("Error whilst escaping: %v", err) - return streamer.escaped, err - } - } - if rdErr != nil { - if rdErr != io.EOF { - err = rdErr - } - break - } - } - return streamer.escaped, err -} - -// EscapeControlString escapes the unicode control sequences in a provided string and returns the findings as an EscapeStatus and the escaped string -func EscapeControlString(text string, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, output string) { - sb := &strings.Builder{} - outputStream := &HTMLStreamerWriter{Writer: sb} - streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer) - - if err := streamer.Text(text); err != nil { - streamer.escaped.HasError = true - log.Error("Error whilst escaping: %v", err) - } - return streamer.escaped, sb.String() -} diff --git a/modules/charset/escape_stream.go b/modules/charset/escape_stream.go index 03d4cfc0c1..3f08fd94a4 100644 --- a/modules/charset/escape_stream.go +++ b/modules/charset/escape_stream.go @@ -64,7 +64,7 @@ func (e *escapeStreamer) Text(data string) error { until, next = nextIdxs[0]+pos, nextIdxs[1]+pos } - // from pos until until we know that the runes are not \r\t\n or even ' ' + // from pos until we know that the runes are not \r\t\n or even ' ' runes := make([]rune, 0, next-until) positions := make([]int, 0, next-until+1) diff --git a/modules/charset/escape_test.go b/modules/charset/escape_test.go index f63c5c5c52..a353ced631 100644 --- a/modules/charset/escape_test.go +++ b/modules/charset/escape_test.go @@ -4,11 +4,14 @@ package charset import ( - "reflect" "strings" "testing" + "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/test" "code.gitea.io/gitea/modules/translation" + + "github.com/stretchr/testify/assert" ) type escapeControlTest struct { @@ -132,22 +135,8 @@ then resh (ר), and finally heh (ה) (which should appear leftmost).`, }, } -func TestEscapeControlString(t *testing.T) { - for _, tt := range escapeControlTests { - t.Run(tt.name, func(t *testing.T) { - status, result := EscapeControlString(tt.text, &translation.MockLocale{}) - if !reflect.DeepEqual(*status, tt.status) { - t.Errorf("EscapeControlString() status = %v, wanted= %v", status, tt.status) - } - if result != tt.result { - t.Errorf("EscapeControlString()\nresult= %v,\nwanted= %v", result, tt.result) - } - }) - } -} - func TestEscapeControlReader(t *testing.T) { - // lets add some control characters to the tests + // add some control characters to the tests tests := make([]escapeControlTest, 0, len(escapeControlTests)*3) copy(tests, escapeControlTests) @@ -169,29 +158,20 @@ func TestEscapeControlReader(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - input := strings.NewReader(tt.text) output := &strings.Builder{} - status, err := EscapeControlReader(input, output, &translation.MockLocale{}) - result := output.String() - if err != nil { - t.Errorf("EscapeControlReader(): err = %v", err) - } - - if !reflect.DeepEqual(*status, tt.status) { - t.Errorf("EscapeControlReader() status = %v, wanted= %v", status, tt.status) - } - if result != tt.result { - t.Errorf("EscapeControlReader()\nresult= %v,\nwanted= %v", result, tt.result) - } + status, err := EscapeControlReader(strings.NewReader(tt.text), output, &translation.MockLocale{}) + assert.NoError(t, err) + assert.Equal(t, tt.status, *status) + assert.Equal(t, tt.result, output.String()) }) } } -func TestEscapeControlReader_panic(t *testing.T) { - bs := make([]byte, 0, 20479) - bs = append(bs, 'A') - for i := 0; i < 6826; i++ { - bs = append(bs, []byte("—")...) - } - _, _ = EscapeControlString(string(bs), &translation.MockLocale{}) +func TestSettingAmbiguousUnicodeDetection(t *testing.T) { + defer test.MockVariableValue(&setting.UI.AmbiguousUnicodeDetection, true)() + _, out := EscapeControlHTML("a test", &translation.MockLocale{}) + assert.EqualValues(t, `a<span class="escaped-code-point" data-escaped="[U+00A0]"><span class="char"> </span></span>test`, out) + setting.UI.AmbiguousUnicodeDetection = false + _, out = EscapeControlHTML("a test", &translation.MockLocale{}) + assert.EqualValues(t, `a test`, out) } diff --git a/modules/git/command.go b/modules/git/command.go index f095bb18be..9305ef6f92 100644 --- a/modules/git/command.go +++ b/modules/git/command.go @@ -14,7 +14,6 @@ import ( "os/exec" "strings" "time" - "unsafe" "code.gitea.io/gitea/modules/git/internal" //nolint:depguard // only this file can use the internal type CmdArg, other files and packages should use AddXxx functions "code.gitea.io/gitea/modules/log" @@ -389,15 +388,11 @@ func (r *runStdError) IsExitCode(code int) bool { return false } -func bytesToString(b []byte) string { - return *(*string)(unsafe.Pointer(&b)) // that's what Golang's strings.Builder.String() does (go/src/strings/builder.go) -} - // RunStdString runs the command with options and returns stdout/stderr as string. and store stderr to returned error (err combined with stderr). func (c *Command) RunStdString(opts *RunOpts) (stdout, stderr string, runErr RunStdError) { stdoutBytes, stderrBytes, err := c.RunStdBytes(opts) - stdout = bytesToString(stdoutBytes) - stderr = bytesToString(stderrBytes) + stdout = util.UnsafeBytesToString(stdoutBytes) + stderr = util.UnsafeBytesToString(stderrBytes) if err != nil { return stdout, stderr, &runStdError{err: err, stderr: stderr} } @@ -432,7 +427,7 @@ func (c *Command) RunStdBytes(opts *RunOpts) (stdout, stderr []byte, runErr RunS err := c.Run(newOpts) stderr = stderrBuf.Bytes() if err != nil { - return nil, stderr, &runStdError{err: err, stderr: bytesToString(stderr)} + return nil, stderr, &runStdError{err: err, stderr: util.UnsafeBytesToString(stderr)} } // even if there is no err, there could still be some stderr output return stdoutBuf.Bytes(), stderr, nil diff --git a/modules/highlight/highlight.go b/modules/highlight/highlight.go index a67217e864..d7ab3f7afd 100644 --- a/modules/highlight/highlight.go +++ b/modules/highlight/highlight.go @@ -9,6 +9,7 @@ import ( "bytes" "fmt" gohtml "html" + "html/template" "io" "path/filepath" "strings" @@ -55,7 +56,7 @@ func NewContext() { } // Code returns a HTML version of code string with chroma syntax highlighting classes and the matched lexer name -func Code(fileName, language, code string) (string, string) { +func Code(fileName, language, code string) (output template.HTML, lexerName string) { NewContext() // diff view newline will be passed as empty, change to literal '\n' so it can be copied @@ -65,7 +66,7 @@ func Code(fileName, language, code string) (string, string) { } if len(code) > sizeLimit { - return code, "" + return template.HTML(template.HTMLEscapeString(code)), "" } var lexer chroma.Lexer @@ -102,13 +103,11 @@ func Code(fileName, language, code string) (string, string) { cache.Add(fileName, lexer) } - lexerName := formatLexerName(lexer.Config().Name) - - return CodeFromLexer(lexer, code), lexerName + return CodeFromLexer(lexer, code), formatLexerName(lexer.Config().Name) } // CodeFromLexer returns a HTML version of code string with chroma syntax highlighting classes -func CodeFromLexer(lexer chroma.Lexer, code string) string { +func CodeFromLexer(lexer chroma.Lexer, code string) template.HTML { formatter := html.New(html.WithClasses(true), html.WithLineNumbers(false), html.PreventSurroundingPre(true), @@ -120,23 +119,23 @@ func CodeFromLexer(lexer chroma.Lexer, code string) string { iterator, err := lexer.Tokenise(nil, code) if err != nil { log.Error("Can't tokenize code: %v", err) - return code + return template.HTML(template.HTMLEscapeString(code)) } // style not used for live site but need to pass something err = formatter.Format(htmlw, githubStyles, iterator) if err != nil { log.Error("Can't format code: %v", err) - return code + return template.HTML(template.HTMLEscapeString(code)) } _ = htmlw.Flush() // Chroma will add newlines for certain lexers in order to highlight them properly // Once highlighted, strip them here, so they don't cause copy/paste trouble in HTML output - return strings.TrimSuffix(htmlbuf.String(), "\n") + return template.HTML(strings.TrimSuffix(htmlbuf.String(), "\n")) } // File returns a slice of chroma syntax highlighted HTML lines of code and the matched lexer name -func File(fileName, language string, code []byte) ([]string, string, error) { +func File(fileName, language string, code []byte) ([]template.HTML, string, error) { NewContext() if len(code) > sizeLimit { @@ -183,14 +182,14 @@ func File(fileName, language string, code []byte) ([]string, string, error) { tokensLines := chroma.SplitTokensIntoLines(iterator.Tokens()) htmlBuf := &bytes.Buffer{} - lines := make([]string, 0, len(tokensLines)) + lines := make([]template.HTML, 0, len(tokensLines)) for _, tokens := range tokensLines { iterator = chroma.Literator(tokens...) err = formatter.Format(htmlBuf, githubStyles, iterator) if err != nil { return nil, "", fmt.Errorf("can't format code: %w", err) } - lines = append(lines, htmlBuf.String()) + lines = append(lines, template.HTML(htmlBuf.String())) htmlBuf.Reset() } @@ -198,9 +197,9 @@ func File(fileName, language string, code []byte) ([]string, string, error) { } // PlainText returns non-highlighted HTML for code -func PlainText(code []byte) []string { +func PlainText(code []byte) []template.HTML { r := bufio.NewReader(bytes.NewReader(code)) - m := make([]string, 0, bytes.Count(code, []byte{'\n'})+1) + m := make([]template.HTML, 0, bytes.Count(code, []byte{'\n'})+1) for { content, err := r.ReadString('\n') if err != nil && err != io.EOF { @@ -210,7 +209,7 @@ func PlainText(code []byte) []string { if content == "" && err == io.EOF { break } - s := gohtml.EscapeString(content) + s := template.HTML(gohtml.EscapeString(content)) m = append(m, s) } return m diff --git a/modules/highlight/highlight_test.go b/modules/highlight/highlight_test.go index 7a9887728f..659688bd0f 100644 --- a/modules/highlight/highlight_test.go +++ b/modules/highlight/highlight_test.go @@ -4,21 +4,36 @@ package highlight import ( + "html/template" "strings" "testing" "github.com/stretchr/testify/assert" ) -func lines(s string) []string { - return strings.Split(strings.ReplaceAll(strings.TrimSpace(s), `\n`, "\n"), "\n") +func lines(s string) (out []template.HTML) { + // "" => [], "a" => ["a"], "a\n" => ["a\n"], "a\nb" => ["a\n", "b"] (each line always includes EOL "\n" if it exists) + out = make([]template.HTML, 0) + s = strings.ReplaceAll(strings.ReplaceAll(strings.TrimSpace(s), "\n", ""), `\n`, "\n") + for { + if p := strings.IndexByte(s, '\n'); p != -1 { + out = append(out, template.HTML(s[:p+1])) + s = s[p+1:] + } else { + break + } + } + if s != "" { + out = append(out, template.HTML(s)) + } + return out } func TestFile(t *testing.T) { tests := []struct { name string code string - want []string + want []template.HTML lexerName string }{ { @@ -99,10 +114,7 @@ c=2 t.Run(tt.name, func(t *testing.T) { out, lexerName, err := File(tt.name, "", []byte(tt.code)) assert.NoError(t, err) - expected := strings.Join(tt.want, "\n") - actual := strings.Join(out, "\n") - assert.Equal(t, strings.Count(actual, "<span"), strings.Count(actual, "</span>")) - assert.EqualValues(t, expected, actual) + assert.EqualValues(t, tt.want, out) assert.Equal(t, tt.lexerName, lexerName) }) } @@ -112,7 +124,7 @@ func TestPlainText(t *testing.T) { tests := []struct { name string code string - want []string + want []template.HTML }{ { name: "empty.py", @@ -165,9 +177,7 @@ c=2`), for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { out := PlainText([]byte(tt.code)) - expected := strings.Join(tt.want, "\n") - actual := strings.Join(out, "\n") - assert.EqualValues(t, expected, actual) + assert.EqualValues(t, tt.want, out) }) } } diff --git a/modules/indexer/code/search.go b/modules/indexer/code/search.go index fdb468df1a..e19e22eea0 100644 --- a/modules/indexer/code/search.go +++ b/modules/indexer/code/search.go @@ -6,6 +6,7 @@ package code import ( "bytes" "context" + "html/template" "strings" "code.gitea.io/gitea/modules/highlight" @@ -22,7 +23,7 @@ type Result struct { Language string Color string LineNumbers []int - FormattedLines string + FormattedLines template.HTML } type SearchResultLanguages = internal.SearchResultLanguages diff --git a/modules/markup/orgmode/orgmode.go b/modules/markup/orgmode/orgmode.go index c1e0144199..e7af02b496 100644 --- a/modules/markup/orgmode/orgmode.go +++ b/modules/markup/orgmode/orgmode.go @@ -87,7 +87,7 @@ func Render(ctx *markup.RenderContext, input io.Reader, output io.Writer) error } lexer = chroma.Coalesce(lexer) - if _, err := w.WriteString(highlight.CodeFromLexer(lexer, source)); err != nil { + if _, err := w.WriteString(string(highlight.CodeFromLexer(lexer, source))); err != nil { return "" } } diff --git a/modules/setting/ui.go b/modules/setting/ui.go index 31042d3ee0..f94e6206cd 100644 --- a/modules/setting/ui.go +++ b/modules/setting/ui.go @@ -35,6 +35,8 @@ var UI = struct { OnlyShowRelevantRepos bool ExploreDefaultSort string `ini:"EXPLORE_PAGING_DEFAULT_SORT"` + AmbiguousUnicodeDetection bool + Notification struct { MinTimeout time.Duration TimeoutStep time.Duration @@ -82,6 +84,9 @@ var UI = struct { Reactions: []string{`+1`, `-1`, `laugh`, `hooray`, `confused`, `heart`, `rocket`, `eyes`}, CustomEmojis: []string{`git`, `gitea`, `codeberg`, `gitlab`, `github`, `gogs`}, CustomEmojisMap: map[string]string{"git": ":git:", "gitea": ":gitea:", "codeberg": ":codeberg:", "gitlab": ":gitlab:", "github": ":github:", "gogs": ":gogs:"}, + + AmbiguousUnicodeDetection: true, + Notification: struct { MinTimeout time.Duration TimeoutStep time.Duration diff --git a/modules/util/string.go b/modules/util/string.go index f2def7b0ec..2cf44d29b1 100644 --- a/modules/util/string.go +++ b/modules/util/string.go @@ -3,7 +3,7 @@ package util -import "github.com/yuin/goldmark/util" +import "unsafe" func isSnakeCaseUpper(c byte) bool { return 'A' <= c && c <= 'Z' @@ -83,5 +83,15 @@ func ToSnakeCase(input string) string { } } } - return util.BytesToReadOnlyString(res) + return UnsafeBytesToString(res) +} + +// UnsafeBytesToString uses Go's unsafe package to convert a byte slice to a string. +// TODO: replace all "goldmark/util.BytesToReadOnlyString" with this official approach +func UnsafeBytesToString(b []byte) string { + return unsafe.String(unsafe.SliceData(b), len(b)) +} + +func UnsafeStringToBytes(s string) []byte { + return unsafe.Slice(unsafe.StringData(s), len(s)) } |