aboutsummaryrefslogtreecommitdiffstats
path: root/modules/indexer
diff options
context:
space:
mode:
authorsilverwind <me@silverwind.io>2024-01-27 19:02:51 +0100
committerGitHub <noreply@github.com>2024-01-27 18:02:51 +0000
commit60e4a98ab07dcf3bd86cf630c79e6433c3ef3e84 (patch)
tree6ccd4d32471410f2b20d3c853dc45268cc2a5e07 /modules/indexer
parent0e650dca3076bbf8e1a4d1a80cef3275a51af658 (diff)
downloadgitea-60e4a98ab07dcf3bd86cf630c79e6433c3ef3e84.tar.gz
gitea-60e4a98ab07dcf3bd86cf630c79e6433c3ef3e84.zip
Preserve BOM in web editor (#28935)
The `ToUTF8*` functions were stripping BOM, while BOM is actually valid in UTF8, so the stripping must be optional depending on use case. This does: - Add a options struct to all `ToUTF8*` functions, that by default will strip BOM to preserve existing behaviour - Remove `ToUTF8` function, it was dead code - Rename `ToUTF8WithErr` to `ToUTF8` - Preserve BOM in Monaco Editor - Remove a unnecessary newline in the textarea value. Browsers did ignore it, it seems but it's better not to rely on this behaviour. Fixes: https://github.com/go-gitea/gitea/issues/28743 Related: https://github.com/go-gitea/gitea/issues/6716 which seems to have once introduced a mechanism that strips and re-adds the BOM, but from what I can tell, this mechanism was removed at some point after that PR.
Diffstat (limited to 'modules/indexer')
-rw-r--r--modules/indexer/code/bleve/bleve.go2
-rw-r--r--modules/indexer/code/elasticsearch/elasticsearch.go2
2 files changed, 2 insertions, 2 deletions
diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go
index 0bfd85cb3f..8ba50ed77c 100644
--- a/modules/indexer/code/bleve/bleve.go
+++ b/modules/indexer/code/bleve/bleve.go
@@ -174,7 +174,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
return batch.Index(id, &RepoIndexerData{
RepoID: repo.ID,
CommitID: commitSha,
- Content: string(charset.ToUTF8DropErrors(fileContents)),
+ Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
UpdatedAt: time.Now().UTC(),
})
diff --git a/modules/indexer/code/elasticsearch/elasticsearch.go b/modules/indexer/code/elasticsearch/elasticsearch.go
index e7e3429a39..2fadbfeb06 100644
--- a/modules/indexer/code/elasticsearch/elasticsearch.go
+++ b/modules/indexer/code/elasticsearch/elasticsearch.go
@@ -135,7 +135,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
Id(id).
Doc(map[string]any{
"repo_id": repo.ID,
- "content": string(charset.ToUTF8DropErrors(fileContents)),
+ "content": string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
"commit_id": sha,
"language": analyze.GetCodeLanguage(update.Filename, fileContents),
"updated_at": timeutil.TimeStampNow(),