about summary refs log tree commit diff stats
path: root/modules/indexer/code
diff options
context:
space:
mode:
author wxiaoguang <wxiaoguang@gmail.com> 2025-03-13 11:07:48 +0800
committer GitHub <noreply@github.com> 2025-03-13 11:07:48 +0800
commit 403775e74ed11097b30a9a683b2253c0967c0dcd (patch)
tree 3d5d4c7687428cad146f7650822c1f0a2af72240 /modules/indexer/code
parent cd10456664ce763212f6841be7839ab5d4229c18 (diff)
download gitea-403775e74ed11097b30a9a683b2253c0967c0dcd.tar.gz
gitea-403775e74ed11097b30a9a683b2253c0967c0dcd.zip
Improve issue & code search (#33860)
Each "indexer" should provide the "search modes" they support by themselves. And we need to remove the "fuzzy" search for code.
Diffstat (limited to 'modules/indexer/code')
-rw-r--r-- modules/indexer/code/bleve/bleve.go 20
-rw-r--r-- modules/indexer/code/elasticsearch/elasticsearch.go 19
-rw-r--r-- modules/indexer/code/gitgrep/gitgrep.go 12
-rw-r--r-- modules/indexer/code/indexer.go 9
-rw-r--r-- modules/indexer/code/indexer_test.go 35
-rw-r--r-- modules/indexer/code/internal/indexer.go 8
-rw-r--r-- modules/indexer/code/internal/util.go 12
-rw-r--r-- modules/indexer/code/internal/util_test.go 30
-rw-r--r-- modules/indexer/code/search.go 1
9 files changed, 66 insertions, 80 deletions
diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go
index fdb7866145..52a934d4ff 100644
--- a/modules/indexer/code/bleve/bleve.go
+++ b/modules/indexer/code/bleve/bleve.go
@@ -17,6 +17,7 @@ import (
"code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/gitrepo"
+ "code.gitea.io/gitea/modules/indexer"
path_filter "code.gitea.io/gitea/modules/indexer/code/bleve/token/path"
"code.gitea.io/gitea/modules/indexer/code/internal"
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
@@ -136,6 +137,10 @@ type Indexer struct {
indexer_internal.Indexer // do not composite inner_bleve.Indexer directly to avoid exposing too much
}
+func (b *Indexer) SupportedSearchModes() []indexer.SearchMode {
+ return indexer.SearchModesExactWords()
+}
+
// NewIndexer creates a new bleve local indexer
func NewIndexer(indexDir string) *Indexer {
inner := inner_bleve.NewIndexer(indexDir, repoIndexerLatestVersion, generateBleveIndexMapping)
@@ -267,19 +272,18 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
pathQuery.FieldVal = "Filename"
pathQuery.SetBoost(10)
- keywordAsPhrase, isPhrase := internal.ParseKeywordAsPhrase(opts.Keyword)
- if isPhrase {
- q := bleve.NewMatchPhraseQuery(keywordAsPhrase)
+ if opts.SearchMode == indexer.SearchModeExact {
+ q := bleve.NewMatchPhraseQuery(opts.Keyword)
q.FieldVal = "Content"
- if opts.IsKeywordFuzzy {
- q.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(keywordAsPhrase)
- }
contentQuery = q
- } else {
+ } else /* words */ {
q := bleve.NewMatchQuery(opts.Keyword)
q.FieldVal = "Content"
- if opts.IsKeywordFuzzy {
+ if opts.SearchMode == indexer.SearchModeFuzzy {
+ // this logic doesn't seem right, it is only used to pass the test-case `Keyword: "dESCRIPTION"`, which doesn't seem to be a real-life use-case.
q.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
+ } else {
+ q.Operator = query.MatchQueryOperatorAnd
}
contentQuery = q
}
diff --git a/modules/indexer/code/elasticsearch/elasticsearch.go b/modules/indexer/code/elasticsearch/elasticsearch.go
index 9abece921e..354a8334fb 100644
--- a/modules/indexer/code/elasticsearch/elasticsearch.go
+++ b/modules/indexer/code/elasticsearch/elasticsearch.go
@@ -16,6 +16,7 @@ import (
"code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/gitrepo"
+ "code.gitea.io/gitea/modules/indexer"
"code.gitea.io/gitea/modules/indexer/code/internal"
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
inner_elasticsearch "code.gitea.io/gitea/modules/indexer/internal/elasticsearch"
@@ -24,7 +25,6 @@ import (
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/timeutil"
"code.gitea.io/gitea/modules/typesniffer"
- "code.gitea.io/gitea/modules/util"
"github.com/go-enry/go-enry/v2"
"github.com/olivere/elastic/v7"
@@ -46,6 +46,10 @@ type Indexer struct {
indexer_internal.Indexer // do not composite inner_elasticsearch.Indexer directly to avoid exposing too much
}
+func (b *Indexer) SupportedSearchModes() []indexer.SearchMode {
+ return indexer.SearchModesExactWords()
+}
+
// NewIndexer creates a new elasticsearch indexer
func NewIndexer(url, indexerName string) *Indexer {
inner := inner_elasticsearch.NewIndexer(url, indexerName, esRepoIndexerLatestVersion, defaultMapping)
@@ -361,15 +365,10 @@ func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLan
// Search searches for codes and language stats by given conditions.
func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
var contentQuery elastic.Query
- keywordAsPhrase, isPhrase := internal.ParseKeywordAsPhrase(opts.Keyword)
- if isPhrase {
- contentQuery = elastic.NewMatchPhraseQuery("content", keywordAsPhrase)
- } else {
- // TODO: this is the old logic, but not really using "fuzziness"
- // * IsKeywordFuzzy=true: "best_fields"
- // * IsKeywordFuzzy=false: "phrase_prefix"
- contentQuery = elastic.NewMultiMatchQuery("content", opts.Keyword).
- Type(util.Iif(opts.IsKeywordFuzzy, esMultiMatchTypeBestFields, esMultiMatchTypePhrasePrefix))
+ if opts.SearchMode == indexer.SearchModeExact {
+ contentQuery = elastic.NewMatchPhraseQuery("content", opts.Keyword)
+ } else /* words */ {
+ contentQuery = elastic.NewMultiMatchQuery("content", opts.Keyword).Type(esMultiMatchTypeBestFields).Operator("and")
}
kwQuery := elastic.NewBoolQuery().Should(
contentQuery,
diff --git a/modules/indexer/code/gitgrep/gitgrep.go b/modules/indexer/code/gitgrep/gitgrep.go
index a85c9d02a5..093c189ba3 100644
--- a/modules/indexer/code/gitgrep/gitgrep.go
+++ b/modules/indexer/code/gitgrep/gitgrep.go
@@ -9,6 +9,7 @@ import (
"strings"
"code.gitea.io/gitea/modules/git"
+ "code.gitea.io/gitea/modules/indexer"
code_indexer "code.gitea.io/gitea/modules/indexer/code"
"code.gitea.io/gitea/modules/setting"
)
@@ -23,11 +24,16 @@ func indexSettingToGitGrepPathspecList() (list []string) {
return list
}
-func PerformSearch(ctx context.Context, page int, repoID int64, gitRepo *git.Repository, ref git.RefName, keyword string, isFuzzy bool) (searchResults []*code_indexer.Result, total int, err error) {
- // TODO: it should also respect ParseKeywordAsPhrase and clarify the "fuzzy" behavior
+func PerformSearch(ctx context.Context, page int, repoID int64, gitRepo *git.Repository, ref git.RefName, keyword string, searchMode indexer.SearchModeType) (searchResults []*code_indexer.Result, total int, err error) {
+ grepMode := git.GrepModeWords
+ if searchMode == indexer.SearchModeExact {
+ grepMode = git.GrepModeExact
+ } else if searchMode == indexer.SearchModeRegexp {
+ grepMode = git.GrepModeRegexp
+ }
res, err := git.GrepSearch(ctx, gitRepo, keyword, git.GrepOptions{
ContextLineNumber: 1,
- IsFuzzy: isFuzzy,
+ GrepMode: grepMode,
RefName: ref.String(),
PathspecList: indexSettingToGitGrepPathspecList(),
})
diff --git a/modules/indexer/code/indexer.go b/modules/indexer/code/indexer.go
index 38fd10dae7..6035ddfe95 100644
--- a/modules/indexer/code/indexer.go
+++ b/modules/indexer/code/indexer.go
@@ -14,6 +14,7 @@ import (
"code.gitea.io/gitea/models/db"
repo_model "code.gitea.io/gitea/models/repo"
"code.gitea.io/gitea/modules/graceful"
+ "code.gitea.io/gitea/modules/indexer"
"code.gitea.io/gitea/modules/indexer/code/bleve"
"code.gitea.io/gitea/modules/indexer/code/elasticsearch"
"code.gitea.io/gitea/modules/indexer/code/internal"
@@ -302,3 +303,11 @@ func populateRepoIndexer(ctx context.Context) {
}
log.Info("Done (re)populating the repo indexer with existing repositories")
}
+
+func SupportedSearchModes() []indexer.SearchMode {
+ gi := globalIndexer.Load()
+ if gi == nil {
+ return nil
+ }
+ return (*gi).SupportedSearchModes()
+}
diff --git a/modules/indexer/code/indexer_test.go b/modules/indexer/code/indexer_test.go
index 43cf8ff254..96516166a0 100644
--- a/modules/indexer/code/indexer_test.go
+++ b/modules/indexer/code/indexer_test.go
@@ -11,6 +11,7 @@ import (
"code.gitea.io/gitea/models/db"
"code.gitea.io/gitea/models/unittest"
+ indexer_module "code.gitea.io/gitea/modules/indexer"
"code.gitea.io/gitea/modules/indexer/code/bleve"
"code.gitea.io/gitea/modules/indexer/code/elasticsearch"
"code.gitea.io/gitea/modules/indexer/code/internal"
@@ -39,10 +40,11 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
assert.NoError(t, setupRepositoryIndexes(t.Context(), indexer))
keywords := []struct {
- RepoIDs []int64
- Keyword string
- Langs int
- Results []codeSearchResult
+ RepoIDs []int64
+ Keyword string
+ Langs int
+ SearchMode indexer_module.SearchModeType
+ Results []codeSearchResult
}{
// Search for an exact match on the contents of a file
// This scenario yields a single result (the file README.md on the repo '1')
@@ -183,9 +185,10 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
},
// Search for matches on the contents of files regardless of case.
{
- RepoIDs: nil,
- Keyword: "dESCRIPTION",
- Langs: 1,
+ RepoIDs: nil,
+ Keyword: "dESCRIPTION",
+ Langs: 1,
+ SearchMode: indexer_module.SearchModeFuzzy,
Results: []codeSearchResult{
{
Filename: "README.md",
@@ -193,7 +196,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
},
},
},
- // Search for an exact match on the filename within the repo '62' (case insenstive).
+ // Search for an exact match on the filename within the repo '62' (case-insensitive).
// This scenario yields a single result (the file avocado.md on the repo '62')
{
RepoIDs: []int64{62},
@@ -206,7 +209,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
},
},
},
- // Search for matches on the contents of files when the criteria is a expression.
+ // Search for matches on the contents of files when the criteria are an expression.
{
RepoIDs: []int64{62},
Keyword: "console.log",
@@ -218,7 +221,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
},
},
},
- // Search for matches on the contents of files when the criteria is part of a expression.
+ // Search for matches on the contents of files when the criteria are parts of an expression.
{
RepoIDs: []int64{62},
Keyword: "log",
@@ -235,16 +238,16 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
for _, kw := range keywords {
t.Run(kw.Keyword, func(t *testing.T) {
total, res, langs, err := indexer.Search(t.Context(), &internal.SearchOptions{
- RepoIDs: kw.RepoIDs,
- Keyword: kw.Keyword,
+ RepoIDs: kw.RepoIDs,
+ Keyword: kw.Keyword,
+ SearchMode: kw.SearchMode,
Paginator: &db.ListOptions{
Page: 1,
PageSize: 10,
},
- IsKeywordFuzzy: true,
})
- assert.NoError(t, err)
- assert.Len(t, langs, kw.Langs)
+ require.NoError(t, err)
+ require.Len(t, langs, kw.Langs)
hits := make([]codeSearchResult, 0, len(res))
@@ -289,7 +292,7 @@ func TestBleveIndexAndSearch(t *testing.T) {
_, err := idx.Init(t.Context())
require.NoError(t, err)
- testIndexer("beleve", t, idx)
+ testIndexer("bleve", t, idx)
}
func TestESIndexAndSearch(t *testing.T) {
diff --git a/modules/indexer/code/internal/indexer.go b/modules/indexer/code/internal/indexer.go
index c259fcd26e..6c9a8af635 100644
--- a/modules/indexer/code/internal/indexer.go
+++ b/modules/indexer/code/internal/indexer.go
@@ -9,6 +9,7 @@ import (
"code.gitea.io/gitea/models/db"
repo_model "code.gitea.io/gitea/models/repo"
+ "code.gitea.io/gitea/modules/indexer"
"code.gitea.io/gitea/modules/indexer/internal"
)
@@ -18,6 +19,7 @@ type Indexer interface {
Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error
Delete(ctx context.Context, repoID int64) error
Search(ctx context.Context, opts *SearchOptions) (int64, []*SearchResult, []*SearchResultLanguages, error)
+ SupportedSearchModes() []indexer.SearchMode
}
type SearchOptions struct {
@@ -25,7 +27,7 @@ type SearchOptions struct {
Keyword string
Language string
- IsKeywordFuzzy bool
+ SearchMode indexer.SearchModeType
db.Paginator
}
@@ -41,6 +43,10 @@ type dummyIndexer struct {
internal.Indexer
}
+func (d *dummyIndexer) SupportedSearchModes() []indexer.SearchMode {
+ return nil
+}
+
func (d *dummyIndexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error {
return fmt.Errorf("indexer is not ready")
}
diff --git a/modules/indexer/code/internal/util.go b/modules/indexer/code/internal/util.go
index 46e631166d..fa958be473 100644
--- a/modules/indexer/code/internal/util.go
+++ b/modules/indexer/code/internal/util.go
@@ -10,9 +10,7 @@ import (
"code.gitea.io/gitea/modules/log"
)
-const (
- filenameMatchNumberOfLines = 7 // Copied from github search
-)
+const filenameMatchNumberOfLines = 7 // Copied from GitHub search
func FilenameIndexerID(repoID int64, filename string) string {
return internal.Base36(repoID) + "_" + filename
@@ -48,11 +46,3 @@ func FilenameMatchIndexPos(content string) (int, int) {
}
return 0, len(content)
}
-
-func ParseKeywordAsPhrase(keyword string) (string, bool) {
- if strings.HasPrefix(keyword, `"`) && strings.HasSuffix(keyword, `"`) && len(keyword) > 1 {
- // only remove the prefix and suffix quotes, no need to decode the content at the moment
- return keyword[1 : len(keyword)-1], true
- }
- return "", false
-}
diff --git a/modules/indexer/code/internal/util_test.go b/modules/indexer/code/internal/util_test.go
deleted file mode 100644
index 457936296b..0000000000
--- a/modules/indexer/code/internal/util_test.go
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright 2025 The Gitea Authors. All rights reserved.
-// SPDX-License-Identifier: MIT
-
-package internal
-
-import (
- "testing"
-
- "github.com/stretchr/testify/assert"
-)
-
-func TestParseKeywordAsPhrase(t *testing.T) {
- cases := []struct {
- keyword string
- phrase string
- isPhrase bool
- }{
- {``, "", false},
- {`a`, "", false},
- {`"`, "", false},
- {`"a`, "", false},
- {`"a"`, "a", true},
- {`""\"""`, `"\""`, true},
- }
- for _, c := range cases {
- phrase, isPhrase := ParseKeywordAsPhrase(c.keyword)
- assert.Equal(t, c.phrase, phrase, "keyword=%q", c.keyword)
- assert.Equal(t, c.isPhrase, isPhrase, "keyword=%q", c.keyword)
- }
-}
diff --git a/modules/indexer/code/search.go b/modules/indexer/code/search.go
index 74c957dde6..e37aff8e59 100644
--- a/modules/indexer/code/search.go
+++ b/modules/indexer/code/search.go
@@ -129,7 +129,6 @@ func searchResult(result *internal.SearchResult, startIndex, endIndex int) (*Res
}
// PerformSearch perform a search on a repository
-// if isFuzzy is true set the Damerau-Levenshtein distance from 0 to 2
func PerformSearch(ctx context.Context, opts *SearchOptions) (int, []*Result, []*SearchResultLanguages, error) {
if opts == nil || len(opts.Keyword) == 0 {
return 0, nil, nil, nil