diff options
author | wxiaoguang <wxiaoguang@gmail.com> | 2025-01-03 05:45:14 +0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-01-02 21:45:14 +0000 |
commit | e10d222434d441d7847716eaea95577a48e31ca3 (patch) | |
tree | a340c6882c69f69fb3d511b7a76133e50ac8b75a | |
parent | 7a35f90b2971a7d569af14a6426db5b1c6a74c1f (diff) | |
download | gitea-e10d222434d441d7847716eaea95577a48e31ca3.tar.gz gitea-e10d222434d441d7847716eaea95577a48e31ca3.zip |
Fix bleve fuzziness search (#33078) (#33087)
-rw-r--r-- | custom/conf/app.example.ini | 4 | ||||
-rw-r--r-- | modules/indexer/code/indexer.go | 3 | ||||
-rw-r--r-- | modules/indexer/code/indexer_test.go | 4 | ||||
-rw-r--r-- | modules/indexer/internal/bleve/util.go | 9 | ||||
-rw-r--r-- | modules/indexer/internal/bleve/util_test.go | 7 | ||||
-rw-r--r-- | modules/setting/indexer.go | 3 | ||||
-rw-r--r-- | routers/common/codesearch.go | 39 | ||||
-rw-r--r-- | routers/web/explore/code.go | 21 | ||||
-rw-r--r-- | routers/web/repo/search.go | 26 | ||||
-rw-r--r-- | routers/web/user/code.go | 22 | ||||
-rw-r--r-- | templates/shared/search/code/search.tmpl | 3 |
11 files changed, 86 insertions, 55 deletions
diff --git a/custom/conf/app.example.ini b/custom/conf/app.example.ini index 6377ebf9d2..6896b073e1 100644 --- a/custom/conf/app.example.ini +++ b/custom/conf/app.example.ini @@ -1482,6 +1482,10 @@ LEVEL = Info ;REPO_INDEXER_EXCLUDE = ;; ;MAX_FILE_SIZE = 1048576 +;; +;; Bleve engine has performance problems with fuzzy search, so we limit the fuzziness to 0 by default to disable it. +;; If you'd like to enable it, you can set it to a value between 0 and 2. +;TYPE_BLEVE_MAX_FUZZINESS = 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/modules/indexer/code/indexer.go b/modules/indexer/code/indexer.go index c1ab26569c..728b37fab6 100644 --- a/modules/indexer/code/indexer.go +++ b/modules/indexer/code/indexer.go @@ -123,13 +123,12 @@ func Init() { for _, indexerData := range items { log.Trace("IndexerData Process Repo: %d", indexerData.RepoID) if err := index(ctx, indexer, indexerData.RepoID); err != nil { - unhandled = append(unhandled, indexerData) if !setting.IsInTesting { log.Error("Codes indexer handler: index error for repo %v: %v", indexerData.RepoID, err) } } } - return unhandled + return nil // do not re-queue the failed items, otherwise some broken repo will block the queue } indexerQueue = queue.CreateUniqueQueue(ctx, "code_indexer", handler) diff --git a/modules/indexer/code/indexer_test.go b/modules/indexer/code/indexer_test.go index d04088531a..f358bbe785 100644 --- a/modules/indexer/code/indexer_test.go +++ b/modules/indexer/code/indexer_test.go @@ -15,6 +15,8 @@ import ( "code.gitea.io/gitea/modules/indexer/code/bleve" "code.gitea.io/gitea/modules/indexer/code/elasticsearch" "code.gitea.io/gitea/modules/indexer/code/internal" + "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/test" _ "code.gitea.io/gitea/models" _ "code.gitea.io/gitea/models/actions" @@ -279,7 +281,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { func TestBleveIndexAndSearch(t *testing.T) { unittest.PrepareTestEnv(t) - + defer test.MockVariableValue(&setting.Indexer.TypeBleveMaxFuzzniess, 2)() dir := t.TempDir() idx := bleve.NewIndexer(dir) diff --git a/modules/indexer/internal/bleve/util.go b/modules/indexer/internal/bleve/util.go index a0c3dc4ad4..b6daa9e14b 100644 --- a/modules/indexer/internal/bleve/util.go +++ b/modules/indexer/internal/bleve/util.go @@ -9,6 +9,7 @@ import ( "unicode" "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/util" "github.com/blevesearch/bleve/v2" @@ -54,9 +55,9 @@ func openIndexer(path string, latestVersion int) (bleve.Index, int, error) { return index, 0, nil } -// This method test the GuessFuzzinessByKeyword method. The fuzziness is based on the levenshtein distance and determines how many chars -// may be different on two string and they still be considered equivalent. -// Given a phrasse, its shortest word determines its fuzziness. If a phrase uses CJK (eg: `갃갃갃` `啊啊啊`), the fuzziness is zero. +// GuessFuzzinessByKeyword guesses fuzziness based on the levenshtein distance and determines how many chars +// may be different on two string, and they still be considered equivalent. +// Given a phrase, its shortest word determines its fuzziness. If a phrase uses CJK (eg: `갃갃갃` `啊啊啊`), the fuzziness is zero. func GuessFuzzinessByKeyword(s string) int { tokenizer := unicode_tokenizer.NewUnicodeTokenizer() tokens := tokenizer.Tokenize([]byte(s)) @@ -85,5 +86,5 @@ func guessFuzzinessByKeyword(s string) int { return 0 } } - return min(maxFuzziness, len(s)/4) + return min(min(setting.Indexer.TypeBleveMaxFuzzniess, maxFuzziness), len(s)/4) } diff --git a/modules/indexer/internal/bleve/util_test.go b/modules/indexer/internal/bleve/util_test.go index 8f7844464e..1a7e4db0f4 100644 --- a/modules/indexer/internal/bleve/util_test.go +++ b/modules/indexer/internal/bleve/util_test.go @@ -7,10 +7,15 @@ import ( "fmt" "testing" + "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/test" + "github.com/stretchr/testify/assert" ) func TestBleveGuessFuzzinessByKeyword(t *testing.T) { + defer test.MockVariableValue(&setting.Indexer.TypeBleveMaxFuzzniess, 2)() + scenarios := []struct { Input string Fuzziness int // See util.go for the definition of fuzziness in this particular context @@ -46,7 +51,7 @@ func TestBleveGuessFuzzinessByKeyword(t *testing.T) { } for _, scenario := range scenarios { - t.Run(fmt.Sprintf("ensure fuzziness of '%s' is '%d'", scenario.Input, scenario.Fuzziness), func(t *testing.T) { + t.Run(fmt.Sprintf("Fuziniess:%s=%d", scenario.Input, scenario.Fuzziness), func(t *testing.T) { assert.Equal(t, scenario.Fuzziness, GuessFuzzinessByKeyword(scenario.Input)) }) } diff --git a/modules/setting/indexer.go b/modules/setting/indexer.go index 34b4eac15b..e34baae012 100644 --- a/modules/setting/indexer.go +++ b/modules/setting/indexer.go @@ -31,6 +31,8 @@ var Indexer = struct { IncludePatterns []*GlobMatcher ExcludePatterns []*GlobMatcher ExcludeVendored bool + + TypeBleveMaxFuzzniess int }{ IssueType: "bleve", IssuePath: "indexers/issues.bleve", @@ -88,6 +90,7 @@ func loadIndexerFrom(rootCfg ConfigProvider) { Indexer.ExcludeVendored = sec.Key("REPO_INDEXER_EXCLUDE_VENDORED").MustBool(true) Indexer.MaxIndexerFileSize = sec.Key("MAX_FILE_SIZE").MustInt64(1024 * 1024) Indexer.StartupTimeout = sec.Key("STARTUP_TIMEOUT").MustDuration(30 * time.Second) + Indexer.TypeBleveMaxFuzzniess = sec.Key("TYPE_BLEVE_MAX_FUZZINESS").MustInt(0) } // IndexerGlobFromString parses a comma separated list of patterns and returns a glob.Glob slice suited for repo indexing diff --git a/routers/common/codesearch.go b/routers/common/codesearch.go new file mode 100644 index 0000000000..a14af126e5 --- /dev/null +++ b/routers/common/codesearch.go @@ -0,0 +1,39 @@ +// Copyright 2024 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package common + +import ( + "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/services/context" +) + +func PrepareCodeSearch(ctx *context.Context) (ret struct { + Keyword string + Language string + IsFuzzy bool +}, +) { + ret.Language = ctx.FormTrim("l") + ret.Keyword = ctx.FormTrim("q") + + fuzzyDefault := setting.Indexer.RepoIndexerEnabled + fuzzyAllow := true + if setting.Indexer.RepoType == "bleve" && setting.Indexer.TypeBleveMaxFuzzniess == 0 { + fuzzyDefault = false + fuzzyAllow = false + } + isFuzzy := ctx.FormOptionalBool("fuzzy").ValueOrDefault(fuzzyDefault) + if isFuzzy && !fuzzyAllow { + ctx.Flash.Info("Fuzzy search is disabled by default due to performance reasons") + isFuzzy = false + } + + ctx.Data["IsBleveFuzzyDisabled"] = true + ctx.Data["Keyword"] = ret.Keyword + ctx.Data["Language"] = ret.Language + ctx.Data["IsFuzzy"] = isFuzzy + + ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled + return ret +} diff --git a/routers/web/explore/code.go b/routers/web/explore/code.go index 48f890332b..3fca36c9ab 100644 --- a/routers/web/explore/code.go +++ b/routers/web/explore/code.go @@ -11,6 +11,7 @@ import ( "code.gitea.io/gitea/modules/base" code_indexer "code.gitea.io/gitea/modules/indexer/code" "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/routers/common" "code.gitea.io/gitea/services/context" ) @@ -32,18 +33,10 @@ func Code(ctx *context.Context) { ctx.Data["Title"] = ctx.Tr("explore") ctx.Data["PageIsExplore"] = true ctx.Data["PageIsExploreCode"] = true - - language := ctx.FormTrim("l") - keyword := ctx.FormTrim("q") - - isFuzzy := ctx.FormOptionalBool("fuzzy").ValueOrDefault(true) - - ctx.Data["Keyword"] = keyword - ctx.Data["Language"] = language - ctx.Data["IsFuzzy"] = isFuzzy ctx.Data["PageIsViewCode"] = true - if keyword == "" { + prepareSearch := common.PrepareCodeSearch(ctx) + if prepareSearch.Keyword == "" { ctx.HTML(http.StatusOK, tplExploreCode) return } @@ -80,9 +73,9 @@ func Code(ctx *context.Context) { if (len(repoIDs) > 0) || isAdmin { total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, &code_indexer.SearchOptions{ RepoIDs: repoIDs, - Keyword: keyword, - IsKeywordFuzzy: isFuzzy, - Language: language, + Keyword: prepareSearch.Keyword, + IsKeywordFuzzy: prepareSearch.IsFuzzy, + Language: prepareSearch.Language, Paginator: &db.ListOptions{ Page: page, PageSize: setting.UI.RepoSearchPagingNum, @@ -138,7 +131,7 @@ func Code(ctx *context.Context) { pager := context.NewPagination(total, setting.UI.RepoSearchPagingNum, page, 5) pager.SetDefaultParams(ctx) - pager.AddParamString("l", language) + pager.AddParamString("l", prepareSearch.Language) ctx.Data["Page"] = pager ctx.HTML(http.StatusOK, tplExploreCode) diff --git a/routers/web/repo/search.go b/routers/web/repo/search.go index 920a865555..d5772ff79d 100644 --- a/routers/web/repo/search.go +++ b/routers/web/repo/search.go @@ -12,6 +12,7 @@ import ( "code.gitea.io/gitea/modules/git" code_indexer "code.gitea.io/gitea/modules/indexer/code" "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/routers/common" "code.gitea.io/gitea/services/context" ) @@ -29,18 +30,9 @@ func indexSettingToGitGrepPathspecList() (list []string) { // Search render repository search page func Search(ctx *context.Context) { - language := ctx.FormTrim("l") - keyword := ctx.FormTrim("q") - - isFuzzy := ctx.FormOptionalBool("fuzzy").ValueOrDefault(true) - - ctx.Data["Keyword"] = keyword - ctx.Data["Language"] = language - ctx.Data["IsFuzzy"] = isFuzzy ctx.Data["PageIsViewCode"] = true - ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled - - if keyword == "" { + prepareSearch := common.PrepareCodeSearch(ctx) + if prepareSearch.Keyword == "" { ctx.HTML(http.StatusOK, tplSearch) return } @@ -57,9 +49,9 @@ func Search(ctx *context.Context) { var err error total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, &code_indexer.SearchOptions{ RepoIDs: []int64{ctx.Repo.Repository.ID}, - Keyword: keyword, - IsKeywordFuzzy: isFuzzy, - Language: language, + Keyword: prepareSearch.Keyword, + IsKeywordFuzzy: prepareSearch.IsFuzzy, + Language: prepareSearch.Language, Paginator: &db.ListOptions{ Page: page, PageSize: setting.UI.RepoSearchPagingNum, @@ -75,9 +67,9 @@ func Search(ctx *context.Context) { ctx.Data["CodeIndexerUnavailable"] = !code_indexer.IsAvailable(ctx) } } else { - res, err := git.GrepSearch(ctx, ctx.Repo.GitRepo, keyword, git.GrepOptions{ + res, err := git.GrepSearch(ctx, ctx.Repo.GitRepo, prepareSearch.Keyword, git.GrepOptions{ ContextLineNumber: 1, - IsFuzzy: isFuzzy, + IsFuzzy: prepareSearch.IsFuzzy, RefName: git.RefNameFromBranch(ctx.Repo.BranchName).String(), // BranchName should be default branch or the first existing branch PathspecList: indexSettingToGitGrepPathspecList(), }) @@ -109,7 +101,7 @@ func Search(ctx *context.Context) { pager := context.NewPagination(total, setting.UI.RepoSearchPagingNum, page, 5) pager.SetDefaultParams(ctx) - pager.AddParamString("l", language) + pager.AddParamString("l", prepareSearch.Language) ctx.Data["Page"] = pager ctx.HTML(http.StatusOK, tplSearch) diff --git a/routers/web/user/code.go b/routers/web/user/code.go index 785c37b124..8f4b4b2d9a 100644 --- a/routers/web/user/code.go +++ b/routers/web/user/code.go @@ -11,6 +11,7 @@ import ( "code.gitea.io/gitea/modules/base" code_indexer "code.gitea.io/gitea/modules/indexer/code" "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/routers/common" shared_user "code.gitea.io/gitea/routers/web/shared/user" "code.gitea.io/gitea/services/context" ) @@ -34,20 +35,11 @@ func CodeSearch(ctx *context.Context) { } ctx.Data["IsPackageEnabled"] = setting.Packages.Enabled - ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled ctx.Data["Title"] = ctx.Tr("explore.code") - - language := ctx.FormTrim("l") - keyword := ctx.FormTrim("q") - - isFuzzy := ctx.FormOptionalBool("fuzzy").ValueOrDefault(true) - - ctx.Data["Keyword"] = keyword - ctx.Data["Language"] = language - ctx.Data["IsFuzzy"] = isFuzzy ctx.Data["IsCodePage"] = true - if keyword == "" { + prepareSearch := common.PrepareCodeSearch(ctx) + if prepareSearch.Keyword == "" { ctx.HTML(http.StatusOK, tplUserCode) return } @@ -77,9 +69,9 @@ func CodeSearch(ctx *context.Context) { if len(repoIDs) > 0 { total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, &code_indexer.SearchOptions{ RepoIDs: repoIDs, - Keyword: keyword, - IsKeywordFuzzy: isFuzzy, - Language: language, + Keyword: prepareSearch.Keyword, + IsKeywordFuzzy: prepareSearch.IsFuzzy, + Language: prepareSearch.Language, Paginator: &db.ListOptions{ Page: page, PageSize: setting.UI.RepoSearchPagingNum, @@ -122,7 +114,7 @@ func CodeSearch(ctx *context.Context) { pager := context.NewPagination(total, setting.UI.RepoSearchPagingNum, page, 5) pager.SetDefaultParams(ctx) - pager.AddParamString("l", language) + pager.AddParamString("l", prepareSearch.Language) ctx.Data["Page"] = pager ctx.HTML(http.StatusOK, tplUserCode) diff --git a/templates/shared/search/code/search.tmpl b/templates/shared/search/code/search.tmpl index e49ea47e03..dde45c0fbf 100644 --- a/templates/shared/search/code/search.tmpl +++ b/templates/shared/search/code/search.tmpl @@ -2,7 +2,8 @@ {{template "shared/search/combo_fuzzy" dict "Value" .Keyword "Disabled" .CodeIndexerUnavailable "IsFuzzy" .IsFuzzy "Placeholder" (ctx.Locale.Tr "search.code_kind")}} </form> <div class="divider"></div> -<div class="ui user list"> +<div class="ui list"> + {{template "base/alert" .}} {{if .CodeIndexerUnavailable}} <div class="ui error message"> <p>{{ctx.Locale.Tr "search.code_search_unavailable"}}</p> |