about summary refs log tree commit diff stats
path: root/modules
diff options
context:
space:
mode:
author wxiaoguang <wxiaoguang@gmail.com> 2023-12-17 22:38:54 +0800
committer GitHub <noreply@github.com> 2023-12-17 14:38:54 +0000
commit 20929edc9962281e35a81756d76dd1caa5741ff8 (patch)
tree 138bbb9c97e609136fe83cf6e5524949218d1e72 /modules
parent 408a4842240e7dd906e682196bd4254d6c76fcb9 (diff)
download gitea-20929edc9962281e35a81756d76dd1caa5741ff8.tar.gz
gitea-20929edc9962281e35a81756d76dd1caa5741ff8.zip
Add option to disable ambiguous unicode characters detection (#28454)
* Close #24483 * Close #28123 * Close #23682 * Close #23149 (maybe more)
Diffstat (limited to 'modules')
-rw-r--r-- modules/charset/escape.go 59
-rw-r--r-- modules/charset/escape_stream.go 2
-rw-r--r-- modules/charset/escape_test.go 52
-rw-r--r-- modules/git/command.go 11
-rw-r--r-- modules/highlight/highlight.go 29
-rw-r--r-- modules/highlight/highlight_test.go 32
-rw-r--r-- modules/indexer/code/search.go 3
-rw-r--r-- modules/markup/orgmode/orgmode.go 2
-rw-r--r-- modules/setting/ui.go 5
-rw-r--r-- modules/util/string.go 14
10 files changed, 85 insertions, 124 deletions
diff --git a/modules/charset/escape.go b/modules/charset/escape.go
index 5608836a45..92e417d1f7 100644
--- a/modules/charset/escape.go
+++ b/modules/charset/escape.go
@@ -8,11 +8,12 @@
package charset
import (
- "bufio"
+ "html/template"
"io"
"strings"
"code.gitea.io/gitea/modules/log"
+ "code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/translation"
)
@@ -20,20 +21,18 @@ import (
const RuneNBSP = 0xa0
// EscapeControlHTML escapes the unicode control sequences in a provided html document
-func EscapeControlHTML(text string, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, output string) {
+func EscapeControlHTML(html template.HTML, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, output template.HTML) {
sb := &strings.Builder{}
- outputStream := &HTMLStreamerWriter{Writer: sb}
- streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer)
-
- if err := StreamHTML(strings.NewReader(text), streamer); err != nil {
- streamer.escaped.HasError = true
- log.Error("Error whilst escaping: %v", err)
- }
- return streamer.escaped, sb.String()
+ escaped, _ = EscapeControlReader(strings.NewReader(string(html)), sb, locale, allowed...) // err has been handled in EscapeControlReader
+ return escaped, template.HTML(sb.String())
}
-// EscapeControlReaders escapes the unicode control sequences in a provided reader of HTML content and writer in a locale and returns the findings as an EscapeStatus and the escaped []byte
+// EscapeControlReader escapes the unicode control sequences in a provided reader of HTML content and writer in a locale and returns the findings as an EscapeStatus
func EscapeControlReader(reader io.Reader, writer io.Writer, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, err error) {
+ if !setting.UI.AmbiguousUnicodeDetection {
+ _, err = io.Copy(writer, reader)
+ return &EscapeStatus{}, err
+ }
outputStream := &HTMLStreamerWriter{Writer: writer}
streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer)
@@ -43,41 +42,3 @@ func EscapeControlReader(reader io.Reader, writer io.Writer, locale translation.
}
return streamer.escaped, err
}
-
-// EscapeControlStringReader escapes the unicode control sequences in a provided reader of string content and writer in a locale and returns the findings as an EscapeStatus and the escaped []byte. HTML line breaks are not inserted after every newline by this method.
-func EscapeControlStringReader(reader io.Reader, writer io.Writer, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, err error) {
- bufRd := bufio.NewReader(reader)
- outputStream := &HTMLStreamerWriter{Writer: writer}
- streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer)
-
- for {
- line, rdErr := bufRd.ReadString('\n')
- if len(line) > 0 {
- if err := streamer.Text(line); err != nil {
- streamer.escaped.HasError = true
- log.Error("Error whilst escaping: %v", err)
- return streamer.escaped, err
- }
- }
- if rdErr != nil {
- if rdErr != io.EOF {
- err = rdErr
- }
- break
- }
- }
- return streamer.escaped, err
-}
-
-// EscapeControlString escapes the unicode control sequences in a provided string and returns the findings as an EscapeStatus and the escaped string
-func EscapeControlString(text string, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, output string) {
- sb := &strings.Builder{}
- outputStream := &HTMLStreamerWriter{Writer: sb}
- streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer)
-
- if err := streamer.Text(text); err != nil {
- streamer.escaped.HasError = true
- log.Error("Error whilst escaping: %v", err)
- }
- return streamer.escaped, sb.String()
-}
diff --git a/modules/charset/escape_stream.go b/modules/charset/escape_stream.go
index 03d4cfc0c1..3f08fd94a4 100644
--- a/modules/charset/escape_stream.go
+++ b/modules/charset/escape_stream.go
@@ -64,7 +64,7 @@ func (e *escapeStreamer) Text(data string) error {
until, next = nextIdxs[0]+pos, nextIdxs[1]+pos
}
- // from pos until until we know that the runes are not \r\t\n or even ' '
+ // from pos until we know that the runes are not \r\t\n or even ' '
runes := make([]rune, 0, next-until)
positions := make([]int, 0, next-until+1)
diff --git a/modules/charset/escape_test.go b/modules/charset/escape_test.go
index f63c5c5c52..a353ced631 100644
--- a/modules/charset/escape_test.go
+++ b/modules/charset/escape_test.go
@@ -4,11 +4,14 @@
package charset
import (
- "reflect"
"strings"
"testing"
+ "code.gitea.io/gitea/modules/setting"
+ "code.gitea.io/gitea/modules/test"
"code.gitea.io/gitea/modules/translation"
+
+ "github.com/stretchr/testify/assert"
)
type escapeControlTest struct {
@@ -132,22 +135,8 @@ then resh (ר), and finally heh (ה) (which should appear leftmost).`,
},
}
-func TestEscapeControlString(t *testing.T) {
- for _, tt := range escapeControlTests {
- t.Run(tt.name, func(t *testing.T) {
- status, result := EscapeControlString(tt.text, &translation.MockLocale{})
- if !reflect.DeepEqual(*status, tt.status) {
- t.Errorf("EscapeControlString() status = %v, wanted= %v", status, tt.status)
- }
- if result != tt.result {
- t.Errorf("EscapeControlString()\nresult= %v,\nwanted= %v", result, tt.result)
- }
- })
- }
-}
-
func TestEscapeControlReader(t *testing.T) {
- // lets add some control characters to the tests
+ // add some control characters to the tests
tests := make([]escapeControlTest, 0, len(escapeControlTests)*3)
copy(tests, escapeControlTests)
@@ -169,29 +158,20 @@ func TestEscapeControlReader(t *testing.T) {
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
- input := strings.NewReader(tt.text)
output := &strings.Builder{}
- status, err := EscapeControlReader(input, output, &translation.MockLocale{})
- result := output.String()
- if err != nil {
- t.Errorf("EscapeControlReader(): err = %v", err)
- }
-
- if !reflect.DeepEqual(*status, tt.status) {
- t.Errorf("EscapeControlReader() status = %v, wanted= %v", status, tt.status)
- }
- if result != tt.result {
- t.Errorf("EscapeControlReader()\nresult= %v,\nwanted= %v", result, tt.result)
- }
+ status, err := EscapeControlReader(strings.NewReader(tt.text), output, &translation.MockLocale{})
+ assert.NoError(t, err)
+ assert.Equal(t, tt.status, *status)
+ assert.Equal(t, tt.result, output.String())
})
}
}
-func TestEscapeControlReader_panic(t *testing.T) {
- bs := make([]byte, 0, 20479)
- bs = append(bs, 'A')
- for i := 0; i < 6826; i++ {
- bs = append(bs, []byte("—")...)
- }
- _, _ = EscapeControlString(string(bs), &translation.MockLocale{})
+func TestSettingAmbiguousUnicodeDetection(t *testing.T) {
+ defer test.MockVariableValue(&setting.UI.AmbiguousUnicodeDetection, true)()
+ _, out := EscapeControlHTML("a test", &translation.MockLocale{})
+ assert.EqualValues(t, `a<span class="escaped-code-point" data-escaped="[U+00A0]"><span class="char"> </span></span>test`, out)
+ setting.UI.AmbiguousUnicodeDetection = false
+ _, out = EscapeControlHTML("a test", &translation.MockLocale{})
+ assert.EqualValues(t, `a test`, out)
}
diff --git a/modules/git/command.go b/modules/git/command.go
index f095bb18be..9305ef6f92 100644
--- a/modules/git/command.go
+++ b/modules/git/command.go
@@ -14,7 +14,6 @@ import (
"os/exec"
"strings"
"time"
- "unsafe"
"code.gitea.io/gitea/modules/git/internal" //nolint:depguard // only this file can use the internal type CmdArg, other files and packages should use AddXxx functions
"code.gitea.io/gitea/modules/log"
@@ -389,15 +388,11 @@ func (r *runStdError) IsExitCode(code int) bool {
return false
}
-func bytesToString(b []byte) string {
- return *(*string)(unsafe.Pointer(&b)) // that's what Golang's strings.Builder.String() does (go/src/strings/builder.go)
-}
-
// RunStdString runs the command with options and returns stdout/stderr as string. and store stderr to returned error (err combined with stderr).
func (c *Command) RunStdString(opts *RunOpts) (stdout, stderr string, runErr RunStdError) {
stdoutBytes, stderrBytes, err := c.RunStdBytes(opts)
- stdout = bytesToString(stdoutBytes)
- stderr = bytesToString(stderrBytes)
+ stdout = util.UnsafeBytesToString(stdoutBytes)
+ stderr = util.UnsafeBytesToString(stderrBytes)
if err != nil {
return stdout, stderr, &runStdError{err: err, stderr: stderr}
}
@@ -432,7 +427,7 @@ func (c *Command) RunStdBytes(opts *RunOpts) (stdout, stderr []byte, runErr RunS
err := c.Run(newOpts)
stderr = stderrBuf.Bytes()
if err != nil {
- return nil, stderr, &runStdError{err: err, stderr: bytesToString(stderr)}
+ return nil, stderr, &runStdError{err: err, stderr: util.UnsafeBytesToString(stderr)}
}
// even if there is no err, there could still be some stderr output
return stdoutBuf.Bytes(), stderr, nil
diff --git a/modules/highlight/highlight.go b/modules/highlight/highlight.go
index a67217e864..d7ab3f7afd 100644
--- a/modules/highlight/highlight.go
+++ b/modules/highlight/highlight.go
@@ -9,6 +9,7 @@ import (
"bytes"
"fmt"
gohtml "html"
+ "html/template"
"io"
"path/filepath"
"strings"
@@ -55,7 +56,7 @@ func NewContext() {
}
// Code returns a HTML version of code string with chroma syntax highlighting classes and the matched lexer name
-func Code(fileName, language, code string) (string, string) {
+func Code(fileName, language, code string) (output template.HTML, lexerName string) {
NewContext()
// diff view newline will be passed as empty, change to literal '\n' so it can be copied
@@ -65,7 +66,7 @@ func Code(fileName, language, code string) (string, string) {
}
if len(code) > sizeLimit {
- return code, ""
+ return template.HTML(template.HTMLEscapeString(code)), ""
}
var lexer chroma.Lexer
@@ -102,13 +103,11 @@ func Code(fileName, language, code string) (string, string) {
cache.Add(fileName, lexer)
}
- lexerName := formatLexerName(lexer.Config().Name)
-
- return CodeFromLexer(lexer, code), lexerName
+ return CodeFromLexer(lexer, code), formatLexerName(lexer.Config().Name)
}
// CodeFromLexer returns a HTML version of code string with chroma syntax highlighting classes
-func CodeFromLexer(lexer chroma.Lexer, code string) string {
+func CodeFromLexer(lexer chroma.Lexer, code string) template.HTML {
formatter := html.New(html.WithClasses(true),
html.WithLineNumbers(false),
html.PreventSurroundingPre(true),
@@ -120,23 +119,23 @@ func CodeFromLexer(lexer chroma.Lexer, code string) string {
iterator, err := lexer.Tokenise(nil, code)
if err != nil {
log.Error("Can't tokenize code: %v", err)
- return code
+ return template.HTML(template.HTMLEscapeString(code))
}
// style not used for live site but need to pass something
err = formatter.Format(htmlw, githubStyles, iterator)
if err != nil {
log.Error("Can't format code: %v", err)
- return code
+ return template.HTML(template.HTMLEscapeString(code))
}
_ = htmlw.Flush()
// Chroma will add newlines for certain lexers in order to highlight them properly
// Once highlighted, strip them here, so they don't cause copy/paste trouble in HTML output
- return strings.TrimSuffix(htmlbuf.String(), "\n")
+ return template.HTML(strings.TrimSuffix(htmlbuf.String(), "\n"))
}
// File returns a slice of chroma syntax highlighted HTML lines of code and the matched lexer name
-func File(fileName, language string, code []byte) ([]string, string, error) {
+func File(fileName, language string, code []byte) ([]template.HTML, string, error) {
NewContext()
if len(code) > sizeLimit {
@@ -183,14 +182,14 @@ func File(fileName, language string, code []byte) ([]string, string, error) {
tokensLines := chroma.SplitTokensIntoLines(iterator.Tokens())
htmlBuf := &bytes.Buffer{}
- lines := make([]string, 0, len(tokensLines))
+ lines := make([]template.HTML, 0, len(tokensLines))
for _, tokens := range tokensLines {
iterator = chroma.Literator(tokens...)
err = formatter.Format(htmlBuf, githubStyles, iterator)
if err != nil {
return nil, "", fmt.Errorf("can't format code: %w", err)
}
- lines = append(lines, htmlBuf.String())
+ lines = append(lines, template.HTML(htmlBuf.String()))
htmlBuf.Reset()
}
@@ -198,9 +197,9 @@ func File(fileName, language string, code []byte) ([]string, string, error) {
}
// PlainText returns non-highlighted HTML for code
-func PlainText(code []byte) []string {
+func PlainText(code []byte) []template.HTML {
r := bufio.NewReader(bytes.NewReader(code))
- m := make([]string, 0, bytes.Count(code, []byte{'\n'})+1)
+ m := make([]template.HTML, 0, bytes.Count(code, []byte{'\n'})+1)
for {
content, err := r.ReadString('\n')
if err != nil && err != io.EOF {
@@ -210,7 +209,7 @@ func PlainText(code []byte) []string {
if content == "" && err == io.EOF {
break
}
- s := gohtml.EscapeString(content)
+ s := template.HTML(gohtml.EscapeString(content))
m = append(m, s)
}
return m
diff --git a/modules/highlight/highlight_test.go b/modules/highlight/highlight_test.go
index 7a9887728f..659688bd0f 100644
--- a/modules/highlight/highlight_test.go
+++ b/modules/highlight/highlight_test.go
@@ -4,21 +4,36 @@
package highlight
import (
+ "html/template"
"strings"
"testing"
"github.com/stretchr/testify/assert"
)
-func lines(s string) []string {
- return strings.Split(strings.ReplaceAll(strings.TrimSpace(s), `\n`, "\n"), "\n")
+func lines(s string) (out []template.HTML) {
+ // "" => [], "a" => ["a"], "a\n" => ["a\n"], "a\nb" => ["a\n", "b"] (each line always includes EOL "\n" if it exists)
+ out = make([]template.HTML, 0)
+ s = strings.ReplaceAll(strings.ReplaceAll(strings.TrimSpace(s), "\n", ""), `\n`, "\n")
+ for {
+ if p := strings.IndexByte(s, '\n'); p != -1 {
+ out = append(out, template.HTML(s[:p+1]))
+ s = s[p+1:]
+ } else {
+ break
+ }
+ }
+ if s != "" {
+ out = append(out, template.HTML(s))
+ }
+ return out
}
func TestFile(t *testing.T) {
tests := []struct {
name string
code string
- want []string
+ want []template.HTML
lexerName string
}{
{
@@ -99,10 +114,7 @@ c=2
t.Run(tt.name, func(t *testing.T) {
out, lexerName, err := File(tt.name, "", []byte(tt.code))
assert.NoError(t, err)
- expected := strings.Join(tt.want, "\n")
- actual := strings.Join(out, "\n")
- assert.Equal(t, strings.Count(actual, "<span"), strings.Count(actual, "</span>"))
- assert.EqualValues(t, expected, actual)
+ assert.EqualValues(t, tt.want, out)
assert.Equal(t, tt.lexerName, lexerName)
})
}
@@ -112,7 +124,7 @@ func TestPlainText(t *testing.T) {
tests := []struct {
name string
code string
- want []string
+ want []template.HTML
}{
{
name: "empty.py",
@@ -165,9 +177,7 @@ c=2`),
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
out := PlainText([]byte(tt.code))
- expected := strings.Join(tt.want, "\n")
- actual := strings.Join(out, "\n")
- assert.EqualValues(t, expected, actual)
+ assert.EqualValues(t, tt.want, out)
})
}
}
diff --git a/modules/indexer/code/search.go b/modules/indexer/code/search.go
index fdb468df1a..e19e22eea0 100644
--- a/modules/indexer/code/search.go
+++ b/modules/indexer/code/search.go
@@ -6,6 +6,7 @@ package code
import (
"bytes"
"context"
+ "html/template"
"strings"
"code.gitea.io/gitea/modules/highlight"
@@ -22,7 +23,7 @@ type Result struct {
Language string
Color string
LineNumbers []int
- FormattedLines string
+ FormattedLines template.HTML
}
type SearchResultLanguages = internal.SearchResultLanguages
diff --git a/modules/markup/orgmode/orgmode.go b/modules/markup/orgmode/orgmode.go
index c1e0144199..e7af02b496 100644
--- a/modules/markup/orgmode/orgmode.go
+++ b/modules/markup/orgmode/orgmode.go
@@ -87,7 +87,7 @@ func Render(ctx *markup.RenderContext, input io.Reader, output io.Writer) error
}
lexer = chroma.Coalesce(lexer)
- if _, err := w.WriteString(highlight.CodeFromLexer(lexer, source)); err != nil {
+ if _, err := w.WriteString(string(highlight.CodeFromLexer(lexer, source))); err != nil {
return ""
}
}
diff --git a/modules/setting/ui.go b/modules/setting/ui.go
index 31042d3ee0..f94e6206cd 100644
--- a/modules/setting/ui.go
+++ b/modules/setting/ui.go
@@ -35,6 +35,8 @@ var UI = struct {
OnlyShowRelevantRepos bool
ExploreDefaultSort string `ini:"EXPLORE_PAGING_DEFAULT_SORT"`
+ AmbiguousUnicodeDetection bool
+
Notification struct {
MinTimeout time.Duration
TimeoutStep time.Duration
@@ -82,6 +84,9 @@ var UI = struct {
Reactions: []string{`+1`, `-1`, `laugh`, `hooray`, `confused`, `heart`, `rocket`, `eyes`},
CustomEmojis: []string{`git`, `gitea`, `codeberg`, `gitlab`, `github`, `gogs`},
CustomEmojisMap: map[string]string{"git": ":git:", "gitea": ":gitea:", "codeberg": ":codeberg:", "gitlab": ":gitlab:", "github": ":github:", "gogs": ":gogs:"},
+
+ AmbiguousUnicodeDetection: true,
+
Notification: struct {
MinTimeout time.Duration
TimeoutStep time.Duration
diff --git a/modules/util/string.go b/modules/util/string.go
index f2def7b0ec..2cf44d29b1 100644
--- a/modules/util/string.go
+++ b/modules/util/string.go
@@ -3,7 +3,7 @@
package util
-import "github.com/yuin/goldmark/util"
+import "unsafe"
func isSnakeCaseUpper(c byte) bool {
return 'A' <= c && c <= 'Z'
@@ -83,5 +83,15 @@ func ToSnakeCase(input string) string {
}
}
}
- return util.BytesToReadOnlyString(res)
+ return UnsafeBytesToString(res)
+}
+
+// UnsafeBytesToString uses Go's unsafe package to convert a byte slice to a string.
+// TODO: replace all "goldmark/util.BytesToReadOnlyString" with this official approach
+func UnsafeBytesToString(b []byte) string {
+ return unsafe.String(unsafe.SliceData(b), len(b))
+}
+
+func UnsafeStringToBytes(s string) []byte {
+ return unsafe.Slice(unsafe.StringData(s), len(s))
}