diff options
author | silverwind <me@silverwind.io> | 2024-01-27 19:02:51 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-01-27 18:02:51 +0000 |
commit | 60e4a98ab07dcf3bd86cf630c79e6433c3ef3e84 (patch) | |
tree | 6ccd4d32471410f2b20d3c853dc45268cc2a5e07 /modules/indexer | |
parent | 0e650dca3076bbf8e1a4d1a80cef3275a51af658 (diff) | |
download | gitea-60e4a98ab07dcf3bd86cf630c79e6433c3ef3e84.tar.gz gitea-60e4a98ab07dcf3bd86cf630c79e6433c3ef3e84.zip |
Preserve BOM in web editor (#28935)
The `ToUTF8*` functions were stripping the BOM, but a BOM is actually valid
in UTF-8, so the stripping must be optional depending on the use case. This
change does the following:
- Add an options struct to all `ToUTF8*` functions that, by default, will
strip the BOM to preserve existing behaviour
- Remove `ToUTF8` function, it was dead code
- Rename `ToUTF8WithErr` to `ToUTF8`
- Preserve BOM in Monaco Editor
- Remove an unnecessary newline in the textarea value. Browsers seem to
ignore it, but it's better not to rely on this behaviour.
Fixes: https://github.com/go-gitea/gitea/issues/28743
Related: https://github.com/go-gitea/gitea/issues/6716 which seems to
have once introduced a mechanism that strips and re-adds the BOM, but
from what I can tell, this mechanism was removed at some point after
that PR.
Diffstat (limited to 'modules/indexer')
-rw-r--r-- | modules/indexer/code/bleve/bleve.go | 2 | ||||
-rw-r--r-- | modules/indexer/code/elasticsearch/elasticsearch.go | 2 |
2 files changed, 2 insertions, 2 deletions
diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go index 0bfd85cb3f..8ba50ed77c 100644 --- a/modules/indexer/code/bleve/bleve.go +++ b/modules/indexer/code/bleve/bleve.go @@ -174,7 +174,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro return batch.Index(id, &RepoIndexerData{ RepoID: repo.ID, CommitID: commitSha, - Content: string(charset.ToUTF8DropErrors(fileContents)), + Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})), Language: analyze.GetCodeLanguage(update.Filename, fileContents), UpdatedAt: time.Now().UTC(), }) diff --git a/modules/indexer/code/elasticsearch/elasticsearch.go b/modules/indexer/code/elasticsearch/elasticsearch.go index e7e3429a39..2fadbfeb06 100644 --- a/modules/indexer/code/elasticsearch/elasticsearch.go +++ b/modules/indexer/code/elasticsearch/elasticsearch.go @@ -135,7 +135,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro Id(id). Doc(map[string]any{ "repo_id": repo.ID, - "content": string(charset.ToUTF8DropErrors(fileContents)), + "content": string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})), "commit_id": sha, "language": analyze.GetCodeLanguage(update.Filename, fileContents), "updated_at": timeutil.TimeStampNow(), |