diff options
author | wxiaoguang <wxiaoguang@gmail.com> | 2025-03-13 11:07:48 +0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-03-13 11:07:48 +0800 |
commit | 403775e74ed11097b30a9a683b2253c0967c0dcd (patch) | |
tree | 3d5d4c7687428cad146f7650822c1f0a2af72240 /modules/indexer/code | |
parent | cd10456664ce763212f6841be7839ab5d4229c18 (diff) | |
download | gitea-403775e74ed11097b30a9a683b2253c0967c0dcd.tar.gz gitea-403775e74ed11097b30a9a683b2253c0967c0dcd.zip |
Improve issue & code search (#33860)
Each "indexer" should declare the "search modes" it supports by
itself. We also need to remove the "fuzzy" search for code.
Diffstat (limited to 'modules/indexer/code')
-rw-r--r-- | modules/indexer/code/bleve/bleve.go | 20 | ||||
-rw-r--r-- | modules/indexer/code/elasticsearch/elasticsearch.go | 19 | ||||
-rw-r--r-- | modules/indexer/code/gitgrep/gitgrep.go | 12 | ||||
-rw-r--r-- | modules/indexer/code/indexer.go | 9 | ||||
-rw-r--r-- | modules/indexer/code/indexer_test.go | 35 | ||||
-rw-r--r-- | modules/indexer/code/internal/indexer.go | 8 | ||||
-rw-r--r-- | modules/indexer/code/internal/util.go | 12 | ||||
-rw-r--r-- | modules/indexer/code/internal/util_test.go | 30 | ||||
-rw-r--r-- | modules/indexer/code/search.go | 1 |
9 files changed, 66 insertions, 80 deletions
diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go index fdb7866145..52a934d4ff 100644 --- a/modules/indexer/code/bleve/bleve.go +++ b/modules/indexer/code/bleve/bleve.go @@ -17,6 +17,7 @@ import ( "code.gitea.io/gitea/modules/charset" "code.gitea.io/gitea/modules/git" "code.gitea.io/gitea/modules/gitrepo" + "code.gitea.io/gitea/modules/indexer" path_filter "code.gitea.io/gitea/modules/indexer/code/bleve/token/path" "code.gitea.io/gitea/modules/indexer/code/internal" indexer_internal "code.gitea.io/gitea/modules/indexer/internal" @@ -136,6 +137,10 @@ type Indexer struct { indexer_internal.Indexer // do not composite inner_bleve.Indexer directly to avoid exposing too much } +func (b *Indexer) SupportedSearchModes() []indexer.SearchMode { + return indexer.SearchModesExactWords() +} + // NewIndexer creates a new bleve local indexer func NewIndexer(indexDir string) *Indexer { inner := inner_bleve.NewIndexer(indexDir, repoIndexerLatestVersion, generateBleveIndexMapping) @@ -267,19 +272,18 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int pathQuery.FieldVal = "Filename" pathQuery.SetBoost(10) - keywordAsPhrase, isPhrase := internal.ParseKeywordAsPhrase(opts.Keyword) - if isPhrase { - q := bleve.NewMatchPhraseQuery(keywordAsPhrase) + if opts.SearchMode == indexer.SearchModeExact { + q := bleve.NewMatchPhraseQuery(opts.Keyword) q.FieldVal = "Content" - if opts.IsKeywordFuzzy { - q.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(keywordAsPhrase) - } contentQuery = q - } else { + } else /* words */ { q := bleve.NewMatchQuery(opts.Keyword) q.FieldVal = "Content" - if opts.IsKeywordFuzzy { + if opts.SearchMode == indexer.SearchModeFuzzy { + // this logic doesn't seem right, it is only used to pass the test-case `Keyword: "dESCRIPTION"`, which doesn't seem to be a real-life use-case. 
q.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword) + } else { + q.Operator = query.MatchQueryOperatorAnd } contentQuery = q } diff --git a/modules/indexer/code/elasticsearch/elasticsearch.go b/modules/indexer/code/elasticsearch/elasticsearch.go index 9abece921e..354a8334fb 100644 --- a/modules/indexer/code/elasticsearch/elasticsearch.go +++ b/modules/indexer/code/elasticsearch/elasticsearch.go @@ -16,6 +16,7 @@ import ( "code.gitea.io/gitea/modules/charset" "code.gitea.io/gitea/modules/git" "code.gitea.io/gitea/modules/gitrepo" + "code.gitea.io/gitea/modules/indexer" "code.gitea.io/gitea/modules/indexer/code/internal" indexer_internal "code.gitea.io/gitea/modules/indexer/internal" inner_elasticsearch "code.gitea.io/gitea/modules/indexer/internal/elasticsearch" @@ -24,7 +25,6 @@ import ( "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/timeutil" "code.gitea.io/gitea/modules/typesniffer" - "code.gitea.io/gitea/modules/util" "github.com/go-enry/go-enry/v2" "github.com/olivere/elastic/v7" @@ -46,6 +46,10 @@ type Indexer struct { indexer_internal.Indexer // do not composite inner_elasticsearch.Indexer directly to avoid exposing too much } +func (b *Indexer) SupportedSearchModes() []indexer.SearchMode { + return indexer.SearchModesExactWords() +} + // NewIndexer creates a new elasticsearch indexer func NewIndexer(url, indexerName string) *Indexer { inner := inner_elasticsearch.NewIndexer(url, indexerName, esRepoIndexerLatestVersion, defaultMapping) @@ -361,15 +365,10 @@ func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLan // Search searches for codes and language stats by given conditions. 
func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) { var contentQuery elastic.Query - keywordAsPhrase, isPhrase := internal.ParseKeywordAsPhrase(opts.Keyword) - if isPhrase { - contentQuery = elastic.NewMatchPhraseQuery("content", keywordAsPhrase) - } else { - // TODO: this is the old logic, but not really using "fuzziness" - // * IsKeywordFuzzy=true: "best_fields" - // * IsKeywordFuzzy=false: "phrase_prefix" - contentQuery = elastic.NewMultiMatchQuery("content", opts.Keyword). - Type(util.Iif(opts.IsKeywordFuzzy, esMultiMatchTypeBestFields, esMultiMatchTypePhrasePrefix)) + if opts.SearchMode == indexer.SearchModeExact { + contentQuery = elastic.NewMatchPhraseQuery("content", opts.Keyword) + } else /* words */ { + contentQuery = elastic.NewMultiMatchQuery("content", opts.Keyword).Type(esMultiMatchTypeBestFields).Operator("and") } kwQuery := elastic.NewBoolQuery().Should( contentQuery, diff --git a/modules/indexer/code/gitgrep/gitgrep.go b/modules/indexer/code/gitgrep/gitgrep.go index a85c9d02a5..093c189ba3 100644 --- a/modules/indexer/code/gitgrep/gitgrep.go +++ b/modules/indexer/code/gitgrep/gitgrep.go @@ -9,6 +9,7 @@ import ( "strings" "code.gitea.io/gitea/modules/git" + "code.gitea.io/gitea/modules/indexer" code_indexer "code.gitea.io/gitea/modules/indexer/code" "code.gitea.io/gitea/modules/setting" ) @@ -23,11 +24,16 @@ func indexSettingToGitGrepPathspecList() (list []string) { return list } -func PerformSearch(ctx context.Context, page int, repoID int64, gitRepo *git.Repository, ref git.RefName, keyword string, isFuzzy bool) (searchResults []*code_indexer.Result, total int, err error) { - // TODO: it should also respect ParseKeywordAsPhrase and clarify the "fuzzy" behavior +func PerformSearch(ctx context.Context, page int, repoID int64, gitRepo *git.Repository, ref git.RefName, keyword string, searchMode indexer.SearchModeType) (searchResults 
[]*code_indexer.Result, total int, err error) { + grepMode := git.GrepModeWords + if searchMode == indexer.SearchModeExact { + grepMode = git.GrepModeExact + } else if searchMode == indexer.SearchModeRegexp { + grepMode = git.GrepModeRegexp + } res, err := git.GrepSearch(ctx, gitRepo, keyword, git.GrepOptions{ ContextLineNumber: 1, - IsFuzzy: isFuzzy, + GrepMode: grepMode, RefName: ref.String(), PathspecList: indexSettingToGitGrepPathspecList(), }) diff --git a/modules/indexer/code/indexer.go b/modules/indexer/code/indexer.go index 38fd10dae7..6035ddfe95 100644 --- a/modules/indexer/code/indexer.go +++ b/modules/indexer/code/indexer.go @@ -14,6 +14,7 @@ import ( "code.gitea.io/gitea/models/db" repo_model "code.gitea.io/gitea/models/repo" "code.gitea.io/gitea/modules/graceful" + "code.gitea.io/gitea/modules/indexer" "code.gitea.io/gitea/modules/indexer/code/bleve" "code.gitea.io/gitea/modules/indexer/code/elasticsearch" "code.gitea.io/gitea/modules/indexer/code/internal" @@ -302,3 +303,11 @@ func populateRepoIndexer(ctx context.Context) { } log.Info("Done (re)populating the repo indexer with existing repositories") } + +func SupportedSearchModes() []indexer.SearchMode { + gi := globalIndexer.Load() + if gi == nil { + return nil + } + return (*gi).SupportedSearchModes() +} diff --git a/modules/indexer/code/indexer_test.go b/modules/indexer/code/indexer_test.go index 43cf8ff254..96516166a0 100644 --- a/modules/indexer/code/indexer_test.go +++ b/modules/indexer/code/indexer_test.go @@ -11,6 +11,7 @@ import ( "code.gitea.io/gitea/models/db" "code.gitea.io/gitea/models/unittest" + indexer_module "code.gitea.io/gitea/modules/indexer" "code.gitea.io/gitea/modules/indexer/code/bleve" "code.gitea.io/gitea/modules/indexer/code/elasticsearch" "code.gitea.io/gitea/modules/indexer/code/internal" @@ -39,10 +40,11 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { assert.NoError(t, setupRepositoryIndexes(t.Context(), indexer)) keywords := []struct { - 
RepoIDs []int64 - Keyword string - Langs int - Results []codeSearchResult + RepoIDs []int64 + Keyword string + Langs int + SearchMode indexer_module.SearchModeType + Results []codeSearchResult }{ // Search for an exact match on the contents of a file // This scenario yields a single result (the file README.md on the repo '1') @@ -183,9 +185,10 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { }, // Search for matches on the contents of files regardless of case. { - RepoIDs: nil, - Keyword: "dESCRIPTION", - Langs: 1, + RepoIDs: nil, + Keyword: "dESCRIPTION", + Langs: 1, + SearchMode: indexer_module.SearchModeFuzzy, Results: []codeSearchResult{ { Filename: "README.md", @@ -193,7 +196,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { }, }, }, - // Search for an exact match on the filename within the repo '62' (case insenstive). + // Search for an exact match on the filename within the repo '62' (case-insensitive). // This scenario yields a single result (the file avocado.md on the repo '62') { RepoIDs: []int64{62}, @@ -206,7 +209,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { }, }, }, - // Search for matches on the contents of files when the criteria is a expression. + // Search for matches on the contents of files when the criteria are an expression. { RepoIDs: []int64{62}, Keyword: "console.log", @@ -218,7 +221,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { }, }, }, - // Search for matches on the contents of files when the criteria is part of a expression. + // Search for matches on the contents of files when the criteria are parts of an expression. 
{ RepoIDs: []int64{62}, Keyword: "log", @@ -235,16 +238,16 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) { for _, kw := range keywords { t.Run(kw.Keyword, func(t *testing.T) { total, res, langs, err := indexer.Search(t.Context(), &internal.SearchOptions{ - RepoIDs: kw.RepoIDs, - Keyword: kw.Keyword, + RepoIDs: kw.RepoIDs, + Keyword: kw.Keyword, + SearchMode: kw.SearchMode, Paginator: &db.ListOptions{ Page: 1, PageSize: 10, }, - IsKeywordFuzzy: true, }) - assert.NoError(t, err) - assert.Len(t, langs, kw.Langs) + require.NoError(t, err) + require.Len(t, langs, kw.Langs) hits := make([]codeSearchResult, 0, len(res)) @@ -289,7 +292,7 @@ func TestBleveIndexAndSearch(t *testing.T) { _, err := idx.Init(t.Context()) require.NoError(t, err) - testIndexer("beleve", t, idx) + testIndexer("bleve", t, idx) } func TestESIndexAndSearch(t *testing.T) { diff --git a/modules/indexer/code/internal/indexer.go b/modules/indexer/code/internal/indexer.go index c259fcd26e..6c9a8af635 100644 --- a/modules/indexer/code/internal/indexer.go +++ b/modules/indexer/code/internal/indexer.go @@ -9,6 +9,7 @@ import ( "code.gitea.io/gitea/models/db" repo_model "code.gitea.io/gitea/models/repo" + "code.gitea.io/gitea/modules/indexer" "code.gitea.io/gitea/modules/indexer/internal" ) @@ -18,6 +19,7 @@ type Indexer interface { Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error Delete(ctx context.Context, repoID int64) error Search(ctx context.Context, opts *SearchOptions) (int64, []*SearchResult, []*SearchResultLanguages, error) + SupportedSearchModes() []indexer.SearchMode } type SearchOptions struct { @@ -25,7 +27,7 @@ type SearchOptions struct { Keyword string Language string - IsKeywordFuzzy bool + SearchMode indexer.SearchModeType db.Paginator } @@ -41,6 +43,10 @@ type dummyIndexer struct { internal.Indexer } +func (d *dummyIndexer) SupportedSearchModes() []indexer.SearchMode { + return nil +} + func (d *dummyIndexer) Index(ctx 
context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error { return fmt.Errorf("indexer is not ready") } diff --git a/modules/indexer/code/internal/util.go b/modules/indexer/code/internal/util.go index 46e631166d..fa958be473 100644 --- a/modules/indexer/code/internal/util.go +++ b/modules/indexer/code/internal/util.go @@ -10,9 +10,7 @@ import ( "code.gitea.io/gitea/modules/log" ) -const ( - filenameMatchNumberOfLines = 7 // Copied from github search -) +const filenameMatchNumberOfLines = 7 // Copied from GitHub search func FilenameIndexerID(repoID int64, filename string) string { return internal.Base36(repoID) + "_" + filename @@ -48,11 +46,3 @@ func FilenameMatchIndexPos(content string) (int, int) { } return 0, len(content) } - -func ParseKeywordAsPhrase(keyword string) (string, bool) { - if strings.HasPrefix(keyword, `"`) && strings.HasSuffix(keyword, `"`) && len(keyword) > 1 { - // only remove the prefix and suffix quotes, no need to decode the content at the moment - return keyword[1 : len(keyword)-1], true - } - return "", false -} diff --git a/modules/indexer/code/internal/util_test.go b/modules/indexer/code/internal/util_test.go deleted file mode 100644 index 457936296b..0000000000 --- a/modules/indexer/code/internal/util_test.go +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2025 The Gitea Authors. All rights reserved. 
-// SPDX-License-Identifier: MIT - -package internal - -import ( - "testing" - - "github.com/stretchr/testify/assert" -) - -func TestParseKeywordAsPhrase(t *testing.T) { - cases := []struct { - keyword string - phrase string - isPhrase bool - }{ - {``, "", false}, - {`a`, "", false}, - {`"`, "", false}, - {`"a`, "", false}, - {`"a"`, "a", true}, - {`""\"""`, `"\""`, true}, - } - for _, c := range cases { - phrase, isPhrase := ParseKeywordAsPhrase(c.keyword) - assert.Equal(t, c.phrase, phrase, "keyword=%q", c.keyword) - assert.Equal(t, c.isPhrase, isPhrase, "keyword=%q", c.keyword) - } -} diff --git a/modules/indexer/code/search.go b/modules/indexer/code/search.go index 74c957dde6..e37aff8e59 100644 --- a/modules/indexer/code/search.go +++ b/modules/indexer/code/search.go @@ -129,7 +129,6 @@ func searchResult(result *internal.SearchResult, startIndex, endIndex int) (*Res } // PerformSearch perform a search on a repository -// if isFuzzy is true set the Damerau-Levenshtein distance from 0 to 2 func PerformSearch(ctx context.Context, opts *SearchOptions) (int, []*Result, []*SearchResultLanguages, error) { if opts == nil || len(opts.Keyword) == 0 { return 0, nil, nil, nil |