diff options
Diffstat (limited to 'modules/indexer/code/bleve/bleve.go')
-rw-r--r-- | modules/indexer/code/bleve/bleve.go | 48 |
1 files changed, 31 insertions, 17 deletions
diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go index 772317fa59..70f0995a01 100644 --- a/modules/indexer/code/bleve/bleve.go +++ b/modules/indexer/code/bleve/bleve.go @@ -16,7 +16,7 @@ import ( "code.gitea.io/gitea/modules/analyze" "code.gitea.io/gitea/modules/charset" "code.gitea.io/gitea/modules/git" - "code.gitea.io/gitea/modules/gitrepo" + "code.gitea.io/gitea/modules/indexer" path_filter "code.gitea.io/gitea/modules/indexer/code/bleve/token/path" "code.gitea.io/gitea/modules/indexer/code/internal" indexer_internal "code.gitea.io/gitea/modules/indexer/internal" @@ -24,11 +24,11 @@ import ( "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/timeutil" "code.gitea.io/gitea/modules/typesniffer" + "code.gitea.io/gitea/modules/util" "github.com/blevesearch/bleve/v2" analyzer_custom "github.com/blevesearch/bleve/v2/analysis/analyzer/custom" analyzer_keyword "github.com/blevesearch/bleve/v2/analysis/analyzer/keyword" - "github.com/blevesearch/bleve/v2/analysis/token/camelcase" "github.com/blevesearch/bleve/v2/analysis/token/lowercase" "github.com/blevesearch/bleve/v2/analysis/token/unicodenorm" "github.com/blevesearch/bleve/v2/analysis/tokenizer/letter" @@ -70,7 +70,7 @@ const ( filenameIndexerAnalyzer = "filenameIndexerAnalyzer" filenameIndexerTokenizer = "filenameIndexerTokenizer" repoIndexerDocType = "repoIndexerDocType" - repoIndexerLatestVersion = 8 + repoIndexerLatestVersion = 9 ) // generateBleveIndexMapping generates a bleve index mapping for the repo indexer @@ -107,7 +107,7 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) { "type": analyzer_custom.Name, "char_filters": []string{}, "tokenizer": letter.Name, - "token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name}, + "token_filters": []string{unicodeNormalizeName, lowercase.Name}, }); err != nil { return nil, err } @@ -136,6 +136,10 @@ type Indexer struct { indexer_internal.Indexer // do not composite inner_bleve.Indexer directly to avoid exposing too much } +func (b *Indexer) SupportedSearchModes() []indexer.SearchMode { + return indexer.SearchModesExactWords() +} + // NewIndexer creates a new bleve local indexer func NewIndexer(indexDir string) *Indexer { inner := inner_bleve.NewIndexer(indexDir, repoIndexerLatestVersion, generateBleveIndexMapping) @@ -158,7 +162,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro var err error if !update.Sized { var stdout string - stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()}) + stdout, _, err = git.NewCommand("cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(ctx, &git.RunOpts{Dir: repo.RepoPath()}) if err != nil { return err } @@ -185,7 +189,8 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro return err } else if !typesniffer.DetectContentType(fileContents).IsText() { // FIXME: UTF-16 files will probably fail here - return nil + // Even if the file is not recognized as a "text file", we could still put its name into the indexers to make the filename become searchable, while leave the content to empty. + fileContents = nil } if _, err = batchReader.Discard(1); err != nil { @@ -211,12 +216,7 @@ func (b *Indexer) addDelete(filename string, repo *repo_model.Repository, batch func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error { batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize) if len(changes.Updates) > 0 { - r, err := gitrepo.OpenRepository(ctx, repo) - if err != nil { - return err - } - defer r.Close() - gitBatch, err := r.NewBatch(ctx) + gitBatch, err := git.NewBatch(ctx, repo.RepoPath()) if err != nil { return err } @@ -260,17 +260,31 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int var ( indexerQuery query.Query keywordQuery query.Query + contentQuery query.Query ) pathQuery := bleve.NewPrefixQuery(strings.ToLower(opts.Keyword)) pathQuery.FieldVal = "Filename" pathQuery.SetBoost(10) - contentQuery := bleve.NewMatchQuery(opts.Keyword) - contentQuery.FieldVal = "Content" - - if opts.IsKeywordFuzzy { - contentQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword) + searchMode := util.IfZero(opts.SearchMode, b.SupportedSearchModes()[0].ModeValue) + if searchMode == indexer.SearchModeExact { + // 1.21 used NewPrefixQuery, but it seems not working well, and later releases changed to NewMatchPhraseQuery + q := bleve.NewMatchPhraseQuery(opts.Keyword) + q.Analyzer = repoIndexerAnalyzer + q.FieldVal = "Content" + contentQuery = q + } else /* words */ { + q := bleve.NewMatchQuery(opts.Keyword) + q.FieldVal = "Content" + q.Analyzer = repoIndexerAnalyzer + if searchMode == indexer.SearchModeFuzzy { + // this logic doesn't seem right, it is only used to pass the test-case `Keyword: "dESCRIPTION"`, which doesn't seem to be a real-life use-case. + q.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword) + } else { + q.Operator = query.MatchQueryOperatorAnd + } + contentQuery = q } keywordQuery = bleve.NewDisjunctionQuery(contentQuery, pathQuery) |