summaryrefslogtreecommitdiffstats
path: root/modules
diff options
context:
space:
mode:
author6543 <m.huber@kithara.com>2024-03-09 02:39:27 +0100
committerGitHub <noreply@github.com>2024-03-09 01:39:27 +0000
commit7fdc0481538151d8a5ed3ec2a32639950f5d8ac6 (patch)
treece93a89a4fde28a2ae063d95f302eea9d4d003fe /modules
parentbaeb2511741aa70d24a48fd46db936b52be9d9dd (diff)
downloadgitea-7fdc0481538151d8a5ed3ec2a32639950f5d8ac6.tar.gz
gitea-7fdc0481538151d8a5ed3ec2a32639950f5d8ac6.zip
Patch in exact search for meilisearch (#29671)
Meilisearch does not currently have a search option to control fuzziness per query: - https://github.com/meilisearch/meilisearch/issues/1192 - https://github.com/orgs/meilisearch/discussions/377 - https://github.com/meilisearch/meilisearch/discussions/1096 so we have to work around this by post-filtering the search results in Gitea until it is addressed. For future work, I added an option (backend only, at the moment) to enable fuzziness for the issue indexer too. I also refactored the code so that the fuzzy option follows the same logic as in the code indexer. --- *Sponsored by Kithara Software GmbH*
Diffstat (limited to 'modules')
-rw-r--r--modules/indexer/code/bleve/bleve.go12
-rw-r--r--modules/indexer/code/elasticsearch/elasticsearch.go8
-rw-r--r--modules/indexer/code/indexer_test.go2
-rw-r--r--modules/indexer/code/internal/indexer.go4
-rw-r--r--modules/indexer/code/search.go5
-rw-r--r--modules/indexer/internal/bleve/query.go7
-rw-r--r--modules/indexer/issues/bleve/bleve.go17
-rw-r--r--modules/indexer/issues/elasticsearch/elasticsearch.go12
-rw-r--r--modules/indexer/issues/internal/model.go2
-rw-r--r--modules/indexer/issues/meilisearch/meilisearch.go91
-rw-r--r--modules/indexer/issues/meilisearch/meilisearch_test.go45
11 files changed, 178 insertions, 27 deletions
diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go
index 8ba50ed77c..107dd23598 100644
--- a/modules/indexer/code/bleve/bleve.go
+++ b/modules/indexer/code/bleve/bleve.go
@@ -233,21 +233,21 @@ func (b *Indexer) Delete(_ context.Context, repoID int64) error {
// Search searches for files in the specified repo.
// Returns the matching file-paths
-func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
+func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
var (
indexerQuery query.Query
keywordQuery query.Query
)
- if isMatch {
- prefixQuery := bleve.NewPrefixQuery(keyword)
- prefixQuery.FieldVal = "Content"
- keywordQuery = prefixQuery
- } else {
+ if isFuzzy {
phraseQuery := bleve.NewMatchPhraseQuery(keyword)
phraseQuery.FieldVal = "Content"
phraseQuery.Analyzer = repoIndexerAnalyzer
keywordQuery = phraseQuery
+ } else {
+ prefixQuery := bleve.NewPrefixQuery(keyword)
+ prefixQuery.FieldVal = "Content"
+ keywordQuery = prefixQuery
}
if len(repoIDs) > 0 {
diff --git a/modules/indexer/code/elasticsearch/elasticsearch.go b/modules/indexer/code/elasticsearch/elasticsearch.go
index 0f70f13485..065b0b2061 100644
--- a/modules/indexer/code/elasticsearch/elasticsearch.go
+++ b/modules/indexer/code/elasticsearch/elasticsearch.go
@@ -281,10 +281,10 @@ func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLan
}
// Search searches for codes and language stats by given conditions.
-func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
- searchType := esMultiMatchTypeBestFields
- if isMatch {
- searchType = esMultiMatchTypePhrasePrefix
+func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
+ searchType := esMultiMatchTypePhrasePrefix
+ if isFuzzy {
+ searchType = esMultiMatchTypeBestFields
}
kwQuery := elastic.NewMultiMatchQuery(keyword, "content").Type(searchType)
diff --git a/modules/indexer/code/indexer_test.go b/modules/indexer/code/indexer_test.go
index 5eb8e61e3d..23dbd63410 100644
--- a/modules/indexer/code/indexer_test.go
+++ b/modules/indexer/code/indexer_test.go
@@ -70,7 +70,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
for _, kw := range keywords {
t.Run(kw.Keyword, func(t *testing.T) {
- total, res, langs, err := indexer.Search(context.TODO(), kw.RepoIDs, "", kw.Keyword, 1, 10, false)
+ total, res, langs, err := indexer.Search(context.TODO(), kw.RepoIDs, "", kw.Keyword, 1, 10, true)
assert.NoError(t, err)
assert.Len(t, kw.IDs, int(total))
assert.Len(t, langs, kw.Langs)
diff --git a/modules/indexer/code/internal/indexer.go b/modules/indexer/code/internal/indexer.go
index da3ac3623c..c92419deb2 100644
--- a/modules/indexer/code/internal/indexer.go
+++ b/modules/indexer/code/internal/indexer.go
@@ -16,7 +16,7 @@ type Indexer interface {
internal.Indexer
Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error
Delete(ctx context.Context, repoID int64) error
- Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*SearchResult, []*SearchResultLanguages, error)
+ Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*SearchResult, []*SearchResultLanguages, error)
}
// NewDummyIndexer returns a dummy indexer
@@ -38,6 +38,6 @@ func (d *dummyIndexer) Delete(ctx context.Context, repoID int64) error {
return fmt.Errorf("indexer is not ready")
}
-func (d *dummyIndexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*SearchResult, []*SearchResultLanguages, error) {
+func (d *dummyIndexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*SearchResult, []*SearchResultLanguages, error) {
return 0, nil, nil, fmt.Errorf("indexer is not ready")
}
diff --git a/modules/indexer/code/search.go b/modules/indexer/code/search.go
index 2ddc2397fa..89a62a8d3e 100644
--- a/modules/indexer/code/search.go
+++ b/modules/indexer/code/search.go
@@ -124,12 +124,13 @@ func searchResult(result *internal.SearchResult, startIndex, endIndex int) (*Res
}
// PerformSearch perform a search on a repository
-func PerformSearch(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int, []*Result, []*internal.SearchResultLanguages, error) {
+// if isFuzzy is true set the Damerau-Levenshtein distance from 0 to 2
+func PerformSearch(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int, []*Result, []*internal.SearchResultLanguages, error) {
if len(keyword) == 0 {
return 0, nil, nil, nil
}
- total, results, resultLanguages, err := (*globalIndexer.Load()).Search(ctx, repoIDs, language, keyword, page, pageSize, isMatch)
+ total, results, resultLanguages, err := (*globalIndexer.Load()).Search(ctx, repoIDs, language, keyword, page, pageSize, isFuzzy)
if err != nil {
return 0, nil, nil, err
}
diff --git a/modules/indexer/internal/bleve/query.go b/modules/indexer/internal/bleve/query.go
index c7d66538c1..2a427c4020 100644
--- a/modules/indexer/internal/bleve/query.go
+++ b/modules/indexer/internal/bleve/query.go
@@ -25,6 +25,13 @@ func MatchPhraseQuery(matchPhrase, field, analyzer string) *query.MatchPhraseQue
return q
}
+// PrefixQuery generates a match prefix query for the given prefix and field
+func PrefixQuery(matchPrefix, field string) *query.PrefixQuery {
+ q := bleve.NewPrefixQuery(matchPrefix)
+ q.FieldVal = field
+ return q
+}
+
// BoolFieldQuery generates a bool field query for the given value and field
func BoolFieldQuery(value bool, field string) *query.BoolFieldQuery {
q := bleve.NewBoolFieldQuery(value)
diff --git a/modules/indexer/issues/bleve/bleve.go b/modules/indexer/issues/bleve/bleve.go
index 6a5d65cb66..aaea854efa 100644
--- a/modules/indexer/issues/bleve/bleve.go
+++ b/modules/indexer/issues/bleve/bleve.go
@@ -156,12 +156,19 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
var queries []query.Query
if options.Keyword != "" {
- keywordQueries := []query.Query{
- inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer),
- inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer),
- inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer),
+ if options.IsFuzzyKeyword {
+ queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{
+ inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer),
+ inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer),
+ inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer),
+ }...))
+ } else {
+ queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{
+ inner_bleve.PrefixQuery(options.Keyword, "title"),
+ inner_bleve.PrefixQuery(options.Keyword, "content"),
+ inner_bleve.PrefixQuery(options.Keyword, "comments"),
+ }...))
}
- queries = append(queries, bleve.NewDisjunctionQuery(keywordQueries...))
}
if len(options.RepoIDs) > 0 || options.AllPublic {
diff --git a/modules/indexer/issues/elasticsearch/elasticsearch.go b/modules/indexer/issues/elasticsearch/elasticsearch.go
index 3acd3ade71..0077da263a 100644
--- a/modules/indexer/issues/elasticsearch/elasticsearch.go
+++ b/modules/indexer/issues/elasticsearch/elasticsearch.go
@@ -19,6 +19,10 @@ import (
const (
issueIndexerLatestVersion = 1
+ // multi-match-types, currently only 2 types are used
+ // Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
+ esMultiMatchTypeBestFields = "best_fields"
+ esMultiMatchTypePhrasePrefix = "phrase_prefix"
)
var _ internal.Indexer = &Indexer{}
@@ -141,7 +145,13 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
query := elastic.NewBoolQuery()
if options.Keyword != "" {
- query.Must(elastic.NewMultiMatchQuery(options.Keyword, "title", "content", "comments"))
+
+ searchType := esMultiMatchTypePhrasePrefix
+ if options.IsFuzzyKeyword {
+ searchType = esMultiMatchTypeBestFields
+ }
+
+ query.Must(elastic.NewMultiMatchQuery(options.Keyword, "title", "content", "comments").Type(searchType))
}
if len(options.RepoIDs) > 0 {
diff --git a/modules/indexer/issues/internal/model.go b/modules/indexer/issues/internal/model.go
index 947335d8ce..d41fec4aba 100644
--- a/modules/indexer/issues/internal/model.go
+++ b/modules/indexer/issues/internal/model.go
@@ -74,6 +74,8 @@ type SearchResult struct {
type SearchOptions struct {
Keyword string // keyword to search
+ IsFuzzyKeyword bool // if false the levenshtein distance is 0
+
RepoIDs []int64 // repository IDs which the issues belong to
AllPublic bool // if include all public repositories
diff --git a/modules/indexer/issues/meilisearch/meilisearch.go b/modules/indexer/issues/meilisearch/meilisearch.go
index 325883196b..c429920065 100644
--- a/modules/indexer/issues/meilisearch/meilisearch.go
+++ b/modules/indexer/issues/meilisearch/meilisearch.go
@@ -5,6 +5,7 @@ package meilisearch
import (
"context"
+ "errors"
"strconv"
"strings"
@@ -16,12 +17,15 @@ import (
)
const (
- issueIndexerLatestVersion = 2
+ issueIndexerLatestVersion = 3
// TODO: make this configurable if necessary
maxTotalHits = 10000
)
+// ErrMalformedResponse is never expected, as we initialize the indexer ourselves and therefore define the types.
+var ErrMalformedResponse = errors.New("meilisearch returned unexpected malformed content")
+
var _ internal.Indexer = &Indexer{}
// Indexer implements Indexer interface
@@ -47,6 +51,9 @@ func NewIndexer(url, apiKey, indexerName string) *Indexer {
},
DisplayedAttributes: []string{
"id",
+ "title",
+ "content",
+ "comments",
},
FilterableAttributes: []string{
"repo_id",
@@ -221,11 +228,9 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
return nil, err
}
- hits := make([]internal.Match, 0, len(searchRes.Hits))
- for _, hit := range searchRes.Hits {
- hits = append(hits, internal.Match{
- ID: int64(hit.(map[string]any)["id"].(float64)),
- })
+ hits, err := nonFuzzyWorkaround(searchRes, options.Keyword, options.IsFuzzyKeyword)
+ if err != nil {
+ return nil, err
}
return &internal.SearchResult{
@@ -241,3 +246,77 @@ func parseSortBy(sortBy internal.SortBy) string {
}
return field + ":asc"
}
+
+// nonFuzzyWorkaround is needed as meilisearch does not have an exact search
+// and you can only change "typo tolerance" per index. So we have to post-filter the results
+// https://www.meilisearch.com/docs/learn/configuration/typo_tolerance#configuring-typo-tolerance
+// TODO: remove once https://github.com/orgs/meilisearch/discussions/377 is addressed
+func nonFuzzyWorkaround(searchRes *meilisearch.SearchResponse, keyword string, isFuzzy bool) ([]internal.Match, error) {
+ hits := make([]internal.Match, 0, len(searchRes.Hits))
+ for _, hit := range searchRes.Hits {
+ hit, ok := hit.(map[string]any)
+ if !ok {
+ return nil, ErrMalformedResponse
+ }
+
+ if !isFuzzy {
+ keyword = strings.ToLower(keyword)
+
+ // declare an anonymous func to check whether the title, the content, or at least one comment contains the keyword
+ found, err := func() (bool, error) {
+ // check if title match first
+ title, ok := hit["title"].(string)
+ if !ok {
+ return false, ErrMalformedResponse
+ } else if strings.Contains(strings.ToLower(title), keyword) {
+ return true, nil
+ }
+
+ // check if content has a match
+ content, ok := hit["content"].(string)
+ if !ok {
+ return false, ErrMalformedResponse
+ } else if strings.Contains(strings.ToLower(content), keyword) {
+ return true, nil
+ }
+
+ // now check for each comment if one has a match
+ // so we first try to cast and skip if there are no comments
+ comments, ok := hit["comments"].([]any)
+ if !ok {
+ return false, ErrMalformedResponse
+ } else if len(comments) == 0 {
+ return false, nil
+ }
+
+ // now we iterate over all and report as soon as we detect one match
+ for i := range comments {
+ comment, ok := comments[i].(string)
+ if !ok {
+ return false, ErrMalformedResponse
+ }
+ if strings.Contains(strings.ToLower(comment), keyword) {
+ return true, nil
+ }
+ }
+
+ // we got no match
+ return false, nil
+ }()
+
+ if err != nil {
+ return nil, err
+ } else if !found {
+ continue
+ }
+ }
+ issueID, ok := hit["id"].(float64)
+ if !ok {
+ return nil, ErrMalformedResponse
+ }
+ hits = append(hits, internal.Match{
+ ID: int64(issueID),
+ })
+ }
+ return hits, nil
+}
diff --git a/modules/indexer/issues/meilisearch/meilisearch_test.go b/modules/indexer/issues/meilisearch/meilisearch_test.go
index 3d7237268e..ecce704236 100644
--- a/modules/indexer/issues/meilisearch/meilisearch_test.go
+++ b/modules/indexer/issues/meilisearch/meilisearch_test.go
@@ -10,7 +10,11 @@ import (
"testing"
"time"
+ "code.gitea.io/gitea/modules/indexer/issues/internal"
"code.gitea.io/gitea/modules/indexer/issues/internal/tests"
+
+ "github.com/meilisearch/meilisearch-go"
+ "github.com/stretchr/testify/assert"
)
func TestMeilisearchIndexer(t *testing.T) {
@@ -48,3 +52,44 @@ func TestMeilisearchIndexer(t *testing.T) {
tests.TestIndexer(t, indexer)
}
+
+func TestNonFuzzyWorkaround(t *testing.T) {
+ // get unexpected return
+ _, err := nonFuzzyWorkaround(&meilisearch.SearchResponse{
+ Hits: []any{"aa", "bb", "cc", "dd"},
+ }, "bowling", false)
+ assert.ErrorIs(t, err, ErrMalformedResponse)
+
+ validResponse := &meilisearch.SearchResponse{
+ Hits: []any{
+ map[string]any{
+ "id": float64(11),
+ "title": "a title",
+ "content": "issue body with no match",
+ "comments": []any{"hey whats up?", "I'm currently bowling", "nice"},
+ },
+ map[string]any{
+ "id": float64(22),
+ "title": "Bowling as title",
+ "content": "",
+ "comments": []any{},
+ },
+ map[string]any{
+ "id": float64(33),
+ "title": "Bowl-ing as fuzzy match",
+ "content": "",
+ "comments": []any{},
+ },
+ },
+ }
+
+ // nonFuzzy
+ hits, err := nonFuzzyWorkaround(validResponse, "bowling", false)
+ assert.NoError(t, err)
+ assert.EqualValues(t, []internal.Match{{ID: 11}, {ID: 22}}, hits)
+
+ // fuzzy
+ hits, err = nonFuzzyWorkaround(validResponse, "bowling", true)
+ assert.NoError(t, err)
+ assert.EqualValues(t, []internal.Match{{ID: 11}, {ID: 22}, {ID: 33}}, hits)
+}