aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorwxiaoguang <wxiaoguang@gmail.com>2025-01-03 05:45:14 +0800
committerGitHub <noreply@github.com>2025-01-02 21:45:14 +0000
commite10d222434d441d7847716eaea95577a48e31ca3 (patch)
treea340c6882c69f69fb3d511b7a76133e50ac8b75a
parent7a35f90b2971a7d569af14a6426db5b1c6a74c1f (diff)
downloadgitea-e10d222434d441d7847716eaea95577a48e31ca3.tar.gz
gitea-e10d222434d441d7847716eaea95577a48e31ca3.zip
Fix bleve fuzziness search (#33078) (#33087)
-rw-r--r--custom/conf/app.example.ini4
-rw-r--r--modules/indexer/code/indexer.go3
-rw-r--r--modules/indexer/code/indexer_test.go4
-rw-r--r--modules/indexer/internal/bleve/util.go9
-rw-r--r--modules/indexer/internal/bleve/util_test.go7
-rw-r--r--modules/setting/indexer.go3
-rw-r--r--routers/common/codesearch.go39
-rw-r--r--routers/web/explore/code.go21
-rw-r--r--routers/web/repo/search.go26
-rw-r--r--routers/web/user/code.go22
-rw-r--r--templates/shared/search/code/search.tmpl3
11 files changed, 86 insertions, 55 deletions
diff --git a/custom/conf/app.example.ini b/custom/conf/app.example.ini
index 6377ebf9d2..6896b073e1 100644
--- a/custom/conf/app.example.ini
+++ b/custom/conf/app.example.ini
@@ -1482,6 +1482,10 @@ LEVEL = Info
;REPO_INDEXER_EXCLUDE =
;;
;MAX_FILE_SIZE = 1048576
+;;
+;; The Bleve engine has performance problems with fuzzy search, so the maximum fuzziness defaults to 0, which disables fuzzy search.
+;; To enable fuzzy search, set it to 1 or 2 (valid values range from 0 to 2).
+;TYPE_BLEVE_MAX_FUZZINESS = 0
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
diff --git a/modules/indexer/code/indexer.go b/modules/indexer/code/indexer.go
index c1ab26569c..728b37fab6 100644
--- a/modules/indexer/code/indexer.go
+++ b/modules/indexer/code/indexer.go
@@ -123,13 +123,12 @@ func Init() {
for _, indexerData := range items {
log.Trace("IndexerData Process Repo: %d", indexerData.RepoID)
if err := index(ctx, indexer, indexerData.RepoID); err != nil {
- unhandled = append(unhandled, indexerData)
if !setting.IsInTesting {
log.Error("Codes indexer handler: index error for repo %v: %v", indexerData.RepoID, err)
}
}
}
- return unhandled
+ return nil // do not re-queue the failed items, otherwise some broken repo will block the queue
}
indexerQueue = queue.CreateUniqueQueue(ctx, "code_indexer", handler)
diff --git a/modules/indexer/code/indexer_test.go b/modules/indexer/code/indexer_test.go
index d04088531a..f358bbe785 100644
--- a/modules/indexer/code/indexer_test.go
+++ b/modules/indexer/code/indexer_test.go
@@ -15,6 +15,8 @@ import (
"code.gitea.io/gitea/modules/indexer/code/bleve"
"code.gitea.io/gitea/modules/indexer/code/elasticsearch"
"code.gitea.io/gitea/modules/indexer/code/internal"
+ "code.gitea.io/gitea/modules/setting"
+ "code.gitea.io/gitea/modules/test"
_ "code.gitea.io/gitea/models"
_ "code.gitea.io/gitea/models/actions"
@@ -279,7 +281,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
func TestBleveIndexAndSearch(t *testing.T) {
unittest.PrepareTestEnv(t)
-
+ defer test.MockVariableValue(&setting.Indexer.TypeBleveMaxFuzzniess, 2)()
dir := t.TempDir()
idx := bleve.NewIndexer(dir)
diff --git a/modules/indexer/internal/bleve/util.go b/modules/indexer/internal/bleve/util.go
index a0c3dc4ad4..b6daa9e14b 100644
--- a/modules/indexer/internal/bleve/util.go
+++ b/modules/indexer/internal/bleve/util.go
@@ -9,6 +9,7 @@ import (
"unicode"
"code.gitea.io/gitea/modules/log"
+ "code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/util"
"github.com/blevesearch/bleve/v2"
@@ -54,9 +55,9 @@ func openIndexer(path string, latestVersion int) (bleve.Index, int, error) {
return index, 0, nil
}
-// This method test the GuessFuzzinessByKeyword method. The fuzziness is based on the levenshtein distance and determines how many chars
-// may be different on two string and they still be considered equivalent.
-// Given a phrasse, its shortest word determines its fuzziness. If a phrase uses CJK (eg: `갃갃갃` `啊啊啊`), the fuzziness is zero.
+// GuessFuzzinessByKeyword guesses the fuzziness to use for a keyword. The fuzziness is based on the Levenshtein distance:
+// it determines how many chars may differ between two strings while they are still considered equivalent.
+// Given a phrase, its shortest word determines its fuzziness. If a phrase uses CJK (eg: `갃갃갃` `啊啊啊`), the fuzziness is zero.
func GuessFuzzinessByKeyword(s string) int {
tokenizer := unicode_tokenizer.NewUnicodeTokenizer()
tokens := tokenizer.Tokenize([]byte(s))
@@ -85,5 +86,5 @@ func guessFuzzinessByKeyword(s string) int {
return 0
}
}
- return min(maxFuzziness, len(s)/4)
+ return min(min(setting.Indexer.TypeBleveMaxFuzzniess, maxFuzziness), len(s)/4)
}
diff --git a/modules/indexer/internal/bleve/util_test.go b/modules/indexer/internal/bleve/util_test.go
index 8f7844464e..1a7e4db0f4 100644
--- a/modules/indexer/internal/bleve/util_test.go
+++ b/modules/indexer/internal/bleve/util_test.go
@@ -7,10 +7,15 @@ import (
"fmt"
"testing"
+ "code.gitea.io/gitea/modules/setting"
+ "code.gitea.io/gitea/modules/test"
+
"github.com/stretchr/testify/assert"
)
func TestBleveGuessFuzzinessByKeyword(t *testing.T) {
+ defer test.MockVariableValue(&setting.Indexer.TypeBleveMaxFuzzniess, 2)()
+
scenarios := []struct {
Input string
Fuzziness int // See util.go for the definition of fuzziness in this particular context
@@ -46,7 +51,7 @@ func TestBleveGuessFuzzinessByKeyword(t *testing.T) {
}
for _, scenario := range scenarios {
- t.Run(fmt.Sprintf("ensure fuzziness of '%s' is '%d'", scenario.Input, scenario.Fuzziness), func(t *testing.T) {
+ t.Run(fmt.Sprintf("Fuziniess:%s=%d", scenario.Input, scenario.Fuzziness), func(t *testing.T) {
assert.Equal(t, scenario.Fuzziness, GuessFuzzinessByKeyword(scenario.Input))
})
}
diff --git a/modules/setting/indexer.go b/modules/setting/indexer.go
index 34b4eac15b..e34baae012 100644
--- a/modules/setting/indexer.go
+++ b/modules/setting/indexer.go
@@ -31,6 +31,8 @@ var Indexer = struct {
IncludePatterns []*GlobMatcher
ExcludePatterns []*GlobMatcher
ExcludeVendored bool
+
+ TypeBleveMaxFuzzniess int
}{
IssueType: "bleve",
IssuePath: "indexers/issues.bleve",
@@ -88,6 +90,7 @@ func loadIndexerFrom(rootCfg ConfigProvider) {
Indexer.ExcludeVendored = sec.Key("REPO_INDEXER_EXCLUDE_VENDORED").MustBool(true)
Indexer.MaxIndexerFileSize = sec.Key("MAX_FILE_SIZE").MustInt64(1024 * 1024)
Indexer.StartupTimeout = sec.Key("STARTUP_TIMEOUT").MustDuration(30 * time.Second)
+ Indexer.TypeBleveMaxFuzzniess = sec.Key("TYPE_BLEVE_MAX_FUZZINESS").MustInt(0)
}
// IndexerGlobFromString parses a comma separated list of patterns and returns a glob.Glob slice suited for repo indexing
diff --git a/routers/common/codesearch.go b/routers/common/codesearch.go
new file mode 100644
index 0000000000..a14af126e5
--- /dev/null
+++ b/routers/common/codesearch.go
@@ -0,0 +1,39 @@
+// Copyright 2024 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package common
+
+import (
+ "code.gitea.io/gitea/modules/setting"
+ "code.gitea.io/gitea/services/context"
+)
+
+// PrepareCodeSearch reads the code search form values ("q", "l", "fuzzy") from the request, exposes them
+// to the templates through ctx.Data and returns them. A fuzzy request is refused, with a flash message,
+// when the repo indexer type is "bleve" and TYPE_BLEVE_MAX_FUZZINESS is 0.
+func PrepareCodeSearch(ctx *context.Context) (ret struct {
+	Keyword  string
+	Language string
+	IsFuzzy  bool
+}) {
+	ret.Language = ctx.FormTrim("l")
+	ret.Keyword = ctx.FormTrim("q")
+	fuzzyDefault := setting.Indexer.RepoIndexerEnabled
+	fuzzyAllow := true
+	if setting.Indexer.RepoType == "bleve" && setting.Indexer.TypeBleveMaxFuzzniess == 0 {
+		fuzzyDefault = false
+		fuzzyAllow = false
+	}
+	ret.IsFuzzy = ctx.FormOptionalBool("fuzzy").ValueOrDefault(fuzzyDefault) // kept in ret: callers read IsFuzzy from the result
+	if ret.IsFuzzy && !fuzzyAllow {
+		ctx.Flash.Info("Fuzzy search is disabled by default due to performance reasons")
+		ret.IsFuzzy = false
+	}
+
+	ctx.Data["IsBleveFuzzyDisabled"] = true // NOTE(review): hard-coded; presumably meant to be !fuzzyAllow — confirm
+	ctx.Data["Keyword"] = ret.Keyword
+	ctx.Data["Language"] = ret.Language
+	ctx.Data["IsFuzzy"] = ret.IsFuzzy
+	ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled
+	return ret
+}
diff --git a/routers/web/explore/code.go b/routers/web/explore/code.go
index 48f890332b..3fca36c9ab 100644
--- a/routers/web/explore/code.go
+++ b/routers/web/explore/code.go
@@ -11,6 +11,7 @@ import (
"code.gitea.io/gitea/modules/base"
code_indexer "code.gitea.io/gitea/modules/indexer/code"
"code.gitea.io/gitea/modules/setting"
+ "code.gitea.io/gitea/routers/common"
"code.gitea.io/gitea/services/context"
)
@@ -32,18 +33,10 @@ func Code(ctx *context.Context) {
ctx.Data["Title"] = ctx.Tr("explore")
ctx.Data["PageIsExplore"] = true
ctx.Data["PageIsExploreCode"] = true
-
- language := ctx.FormTrim("l")
- keyword := ctx.FormTrim("q")
-
- isFuzzy := ctx.FormOptionalBool("fuzzy").ValueOrDefault(true)
-
- ctx.Data["Keyword"] = keyword
- ctx.Data["Language"] = language
- ctx.Data["IsFuzzy"] = isFuzzy
ctx.Data["PageIsViewCode"] = true
- if keyword == "" {
+ prepareSearch := common.PrepareCodeSearch(ctx)
+ if prepareSearch.Keyword == "" {
ctx.HTML(http.StatusOK, tplExploreCode)
return
}
@@ -80,9 +73,9 @@ func Code(ctx *context.Context) {
if (len(repoIDs) > 0) || isAdmin {
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, &code_indexer.SearchOptions{
RepoIDs: repoIDs,
- Keyword: keyword,
- IsKeywordFuzzy: isFuzzy,
- Language: language,
+ Keyword: prepareSearch.Keyword,
+ IsKeywordFuzzy: prepareSearch.IsFuzzy,
+ Language: prepareSearch.Language,
Paginator: &db.ListOptions{
Page: page,
PageSize: setting.UI.RepoSearchPagingNum,
@@ -138,7 +131,7 @@ func Code(ctx *context.Context) {
pager := context.NewPagination(total, setting.UI.RepoSearchPagingNum, page, 5)
pager.SetDefaultParams(ctx)
- pager.AddParamString("l", language)
+ pager.AddParamString("l", prepareSearch.Language)
ctx.Data["Page"] = pager
ctx.HTML(http.StatusOK, tplExploreCode)
diff --git a/routers/web/repo/search.go b/routers/web/repo/search.go
index 920a865555..d5772ff79d 100644
--- a/routers/web/repo/search.go
+++ b/routers/web/repo/search.go
@@ -12,6 +12,7 @@ import (
"code.gitea.io/gitea/modules/git"
code_indexer "code.gitea.io/gitea/modules/indexer/code"
"code.gitea.io/gitea/modules/setting"
+ "code.gitea.io/gitea/routers/common"
"code.gitea.io/gitea/services/context"
)
@@ -29,18 +30,9 @@ func indexSettingToGitGrepPathspecList() (list []string) {
// Search render repository search page
func Search(ctx *context.Context) {
- language := ctx.FormTrim("l")
- keyword := ctx.FormTrim("q")
-
- isFuzzy := ctx.FormOptionalBool("fuzzy").ValueOrDefault(true)
-
- ctx.Data["Keyword"] = keyword
- ctx.Data["Language"] = language
- ctx.Data["IsFuzzy"] = isFuzzy
ctx.Data["PageIsViewCode"] = true
- ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled
-
- if keyword == "" {
+ prepareSearch := common.PrepareCodeSearch(ctx)
+ if prepareSearch.Keyword == "" {
ctx.HTML(http.StatusOK, tplSearch)
return
}
@@ -57,9 +49,9 @@ func Search(ctx *context.Context) {
var err error
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, &code_indexer.SearchOptions{
RepoIDs: []int64{ctx.Repo.Repository.ID},
- Keyword: keyword,
- IsKeywordFuzzy: isFuzzy,
- Language: language,
+ Keyword: prepareSearch.Keyword,
+ IsKeywordFuzzy: prepareSearch.IsFuzzy,
+ Language: prepareSearch.Language,
Paginator: &db.ListOptions{
Page: page,
PageSize: setting.UI.RepoSearchPagingNum,
@@ -75,9 +67,9 @@ func Search(ctx *context.Context) {
ctx.Data["CodeIndexerUnavailable"] = !code_indexer.IsAvailable(ctx)
}
} else {
- res, err := git.GrepSearch(ctx, ctx.Repo.GitRepo, keyword, git.GrepOptions{
+ res, err := git.GrepSearch(ctx, ctx.Repo.GitRepo, prepareSearch.Keyword, git.GrepOptions{
ContextLineNumber: 1,
- IsFuzzy: isFuzzy,
+ IsFuzzy: prepareSearch.IsFuzzy,
RefName: git.RefNameFromBranch(ctx.Repo.BranchName).String(), // BranchName should be default branch or the first existing branch
PathspecList: indexSettingToGitGrepPathspecList(),
})
@@ -109,7 +101,7 @@ func Search(ctx *context.Context) {
pager := context.NewPagination(total, setting.UI.RepoSearchPagingNum, page, 5)
pager.SetDefaultParams(ctx)
- pager.AddParamString("l", language)
+ pager.AddParamString("l", prepareSearch.Language)
ctx.Data["Page"] = pager
ctx.HTML(http.StatusOK, tplSearch)
diff --git a/routers/web/user/code.go b/routers/web/user/code.go
index 785c37b124..8f4b4b2d9a 100644
--- a/routers/web/user/code.go
+++ b/routers/web/user/code.go
@@ -11,6 +11,7 @@ import (
"code.gitea.io/gitea/modules/base"
code_indexer "code.gitea.io/gitea/modules/indexer/code"
"code.gitea.io/gitea/modules/setting"
+ "code.gitea.io/gitea/routers/common"
shared_user "code.gitea.io/gitea/routers/web/shared/user"
"code.gitea.io/gitea/services/context"
)
@@ -34,20 +35,11 @@ func CodeSearch(ctx *context.Context) {
}
ctx.Data["IsPackageEnabled"] = setting.Packages.Enabled
- ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled
ctx.Data["Title"] = ctx.Tr("explore.code")
-
- language := ctx.FormTrim("l")
- keyword := ctx.FormTrim("q")
-
- isFuzzy := ctx.FormOptionalBool("fuzzy").ValueOrDefault(true)
-
- ctx.Data["Keyword"] = keyword
- ctx.Data["Language"] = language
- ctx.Data["IsFuzzy"] = isFuzzy
ctx.Data["IsCodePage"] = true
- if keyword == "" {
+ prepareSearch := common.PrepareCodeSearch(ctx)
+ if prepareSearch.Keyword == "" {
ctx.HTML(http.StatusOK, tplUserCode)
return
}
@@ -77,9 +69,9 @@ func CodeSearch(ctx *context.Context) {
if len(repoIDs) > 0 {
total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, &code_indexer.SearchOptions{
RepoIDs: repoIDs,
- Keyword: keyword,
- IsKeywordFuzzy: isFuzzy,
- Language: language,
+ Keyword: prepareSearch.Keyword,
+ IsKeywordFuzzy: prepareSearch.IsFuzzy,
+ Language: prepareSearch.Language,
Paginator: &db.ListOptions{
Page: page,
PageSize: setting.UI.RepoSearchPagingNum,
@@ -122,7 +114,7 @@ func CodeSearch(ctx *context.Context) {
pager := context.NewPagination(total, setting.UI.RepoSearchPagingNum, page, 5)
pager.SetDefaultParams(ctx)
- pager.AddParamString("l", language)
+ pager.AddParamString("l", prepareSearch.Language)
ctx.Data["Page"] = pager
ctx.HTML(http.StatusOK, tplUserCode)
diff --git a/templates/shared/search/code/search.tmpl b/templates/shared/search/code/search.tmpl
index e49ea47e03..dde45c0fbf 100644
--- a/templates/shared/search/code/search.tmpl
+++ b/templates/shared/search/code/search.tmpl
@@ -2,7 +2,8 @@
{{template "shared/search/combo_fuzzy" dict "Value" .Keyword "Disabled" .CodeIndexerUnavailable "IsFuzzy" .IsFuzzy "Placeholder" (ctx.Locale.Tr "search.code_kind")}}
</form>
<div class="divider"></div>
-<div class="ui user list">
+<div class="ui list">
+ {{template "base/alert" .}}
{{if .CodeIndexerUnavailable}}
<div class="ui error message">
<p>{{ctx.Locale.Tr "search.code_search_unavailable"}}</p>