source.dussan.org Git - gitea.git/commitdiff
Patch in exact search for meilisearch (#29671)
author 6543 <m.huber@kithara.com>
Sat, 9 Mar 2024 01:39:27 +0000 (02:39 +0100)
committer GitHub <noreply@github.com>
Sat, 9 Mar 2024 01:39:27 +0000 (01:39 +0000)
Meilisearch does not have a search option to control fuzziness per query
right now:
 - https://github.com/meilisearch/meilisearch/issues/1192
 - https://github.com/orgs/meilisearch/discussions/377
 - https://github.com/meilisearch/meilisearch/discussions/1096

So we have to create a workaround by post-filtering the search results in
Gitea until this is addressed.

For future work, I added an option (backend only at the moment) to enable
fuzziness for the issue indexer too.
I also refactored the code so that the fuzzy option follows the same logic as
in the code indexer.

---
*Sponsored by Kithara Software GmbH*

14 files changed:
modules/indexer/code/bleve/bleve.go
modules/indexer/code/elasticsearch/elasticsearch.go
modules/indexer/code/indexer_test.go
modules/indexer/code/internal/indexer.go
modules/indexer/code/search.go
modules/indexer/internal/bleve/query.go
modules/indexer/issues/bleve/bleve.go
modules/indexer/issues/elasticsearch/elasticsearch.go
modules/indexer/issues/internal/model.go
modules/indexer/issues/meilisearch/meilisearch.go
modules/indexer/issues/meilisearch/meilisearch_test.go
routers/web/explore/code.go
routers/web/repo/search.go
routers/web/user/code.go

index 8ba50ed77c9380bdbeac0257c5e8cb526e11a980..107dd23598d1bd2bf116fdb6e130cca3baf8ad40 100644 (file)
@@ -233,21 +233,21 @@ func (b *Indexer) Delete(_ context.Context, repoID int64) error {
 
 // Search searches for files in the specified repo.
 // Returns the matching file-paths
-func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
+func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
        var (
                indexerQuery query.Query
                keywordQuery query.Query
        )
 
-       if isMatch {
-               prefixQuery := bleve.NewPrefixQuery(keyword)
-               prefixQuery.FieldVal = "Content"
-               keywordQuery = prefixQuery
-       } else {
+       if isFuzzy {
                phraseQuery := bleve.NewMatchPhraseQuery(keyword)
                phraseQuery.FieldVal = "Content"
                phraseQuery.Analyzer = repoIndexerAnalyzer
                keywordQuery = phraseQuery
+       } else {
+               prefixQuery := bleve.NewPrefixQuery(keyword)
+               prefixQuery.FieldVal = "Content"
+               keywordQuery = prefixQuery
        }
 
        if len(repoIDs) > 0 {
index 0f70f1348552c1e54839a415eb178fa6e935d6f1..065b0b20618e7e09dbe69e4a8b58ead7df58710f 100644 (file)
@@ -281,10 +281,10 @@ func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLan
 }
 
 // Search searches for codes and language stats by given conditions.
-func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
-       searchType := esMultiMatchTypeBestFields
-       if isMatch {
-               searchType = esMultiMatchTypePhrasePrefix
+func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
+       searchType := esMultiMatchTypePhrasePrefix
+       if isFuzzy {
+               searchType = esMultiMatchTypeBestFields
        }
 
        kwQuery := elastic.NewMultiMatchQuery(keyword, "content").Type(searchType)
index 5eb8e61e3db4d27bb2d7f85766d159bdeb4afb8b..23dbd63410541a397e6c9cbee17e2b572e42be93 100644 (file)
@@ -70,7 +70,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
 
                for _, kw := range keywords {
                        t.Run(kw.Keyword, func(t *testing.T) {
-                               total, res, langs, err := indexer.Search(context.TODO(), kw.RepoIDs, "", kw.Keyword, 1, 10, false)
+                               total, res, langs, err := indexer.Search(context.TODO(), kw.RepoIDs, "", kw.Keyword, 1, 10, true)
                                assert.NoError(t, err)
                                assert.Len(t, kw.IDs, int(total))
                                assert.Len(t, langs, kw.Langs)
index da3ac3623c92f173f30aa661f87e5a2b7df00fb1..c92419deb22f7e879159854333be2aeca7c09233 100644 (file)
@@ -16,7 +16,7 @@ type Indexer interface {
        internal.Indexer
        Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error
        Delete(ctx context.Context, repoID int64) error
-       Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*SearchResult, []*SearchResultLanguages, error)
+       Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*SearchResult, []*SearchResultLanguages, error)
 }
 
 // NewDummyIndexer returns a dummy indexer
@@ -38,6 +38,6 @@ func (d *dummyIndexer) Delete(ctx context.Context, repoID int64) error {
        return fmt.Errorf("indexer is not ready")
 }
 
-func (d *dummyIndexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*SearchResult, []*SearchResultLanguages, error) {
+func (d *dummyIndexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*SearchResult, []*SearchResultLanguages, error) {
        return 0, nil, nil, fmt.Errorf("indexer is not ready")
 }
index 2ddc2397fa1917d52126f5fc4cfcf72c77469094..89a62a8d3e2ddc0b3bf1aa110954dfed0f8a88ee 100644 (file)
@@ -124,12 +124,13 @@ func searchResult(result *internal.SearchResult, startIndex, endIndex int) (*Res
 }
 
 // PerformSearch perform a search on a repository
-func PerformSearch(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int, []*Result, []*internal.SearchResultLanguages, error) {
+// if isFuzzy is true set the Damerau-Levenshtein distance from 0 to 2
+func PerformSearch(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int, []*Result, []*internal.SearchResultLanguages, error) {
        if len(keyword) == 0 {
                return 0, nil, nil, nil
        }
 
-       total, results, resultLanguages, err := (*globalIndexer.Load()).Search(ctx, repoIDs, language, keyword, page, pageSize, isMatch)
+       total, results, resultLanguages, err := (*globalIndexer.Load()).Search(ctx, repoIDs, language, keyword, page, pageSize, isFuzzy)
        if err != nil {
                return 0, nil, nil, err
        }
index c7d66538c1263e6e3a972acb01cc5365b52ec5fd..2a427c402026a0112a3185f5106d8bed7ae06674 100644 (file)
@@ -25,6 +25,13 @@ func MatchPhraseQuery(matchPhrase, field, analyzer string) *query.MatchPhraseQue
        return q
 }
 
+// PrefixQuery generates a match prefix query for the given prefix and field
+func PrefixQuery(matchPrefix, field string) *query.PrefixQuery {
+       q := bleve.NewPrefixQuery(matchPrefix)
+       q.FieldVal = field
+       return q
+}
+
 // BoolFieldQuery generates a bool field query for the given value and field
 func BoolFieldQuery(value bool, field string) *query.BoolFieldQuery {
        q := bleve.NewBoolFieldQuery(value)
index 6a5d65cb665d0ba2355fafd9a0d3512070f0b0c1..aaea854efa03128327f8dda423b1cd24f813a248 100644 (file)
@@ -156,12 +156,19 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
        var queries []query.Query
 
        if options.Keyword != "" {
-               keywordQueries := []query.Query{
-                       inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer),
-                       inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer),
-                       inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer),
+               if options.IsFuzzyKeyword {
+                       queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{
+                               inner_bleve.MatchPhraseQuery(options.Keyword, "title", issueIndexerAnalyzer),
+                               inner_bleve.MatchPhraseQuery(options.Keyword, "content", issueIndexerAnalyzer),
+                               inner_bleve.MatchPhraseQuery(options.Keyword, "comments", issueIndexerAnalyzer),
+                       }...))
+               } else {
+                       queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{
+                               inner_bleve.PrefixQuery(options.Keyword, "title"),
+                               inner_bleve.PrefixQuery(options.Keyword, "content"),
+                               inner_bleve.PrefixQuery(options.Keyword, "comments"),
+                       }...))
                }
-               queries = append(queries, bleve.NewDisjunctionQuery(keywordQueries...))
        }
 
        if len(options.RepoIDs) > 0 || options.AllPublic {
index 3acd3ade715280badf7382fe3132ea497664af13..0077da263a7cc92bd16aa87862221c640c4fff95 100644 (file)
@@ -19,6 +19,10 @@ import (
 
 const (
        issueIndexerLatestVersion = 1
+       // multi-match-types, currently only 2 types are used
+       // Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
+       esMultiMatchTypeBestFields   = "best_fields"
+       esMultiMatchTypePhrasePrefix = "phrase_prefix"
 )
 
 var _ internal.Indexer = &Indexer{}
@@ -141,7 +145,13 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
        query := elastic.NewBoolQuery()
 
        if options.Keyword != "" {
-               query.Must(elastic.NewMultiMatchQuery(options.Keyword, "title", "content", "comments"))
+
+               searchType := esMultiMatchTypePhrasePrefix
+               if options.IsFuzzyKeyword {
+                       searchType = esMultiMatchTypeBestFields
+               }
+
+               query.Must(elastic.NewMultiMatchQuery(options.Keyword, "title", "content", "comments").Type(searchType))
        }
 
        if len(options.RepoIDs) > 0 {
index 947335d8ce9cb24fa453e983c6e960ebb49bfb10..d41fec4aba87dcdbec019753e3c836d0dd8d437f 100644 (file)
@@ -74,6 +74,8 @@ type SearchResult struct {
 type SearchOptions struct {
        Keyword string // keyword to search
 
+       IsFuzzyKeyword bool // if false the levenshtein distance is 0
+
        RepoIDs   []int64 // repository IDs which the issues belong to
        AllPublic bool    // if include all public repositories
 
index 325883196bb27c5d24b2dd0cee41045d905821ed..c429920065308a41ec40554b0d3176aa7b769bd4 100644 (file)
@@ -5,6 +5,7 @@ package meilisearch
 
 import (
        "context"
+       "errors"
        "strconv"
        "strings"
 
@@ -16,12 +17,15 @@ import (
 )
 
 const (
-       issueIndexerLatestVersion = 2
+       issueIndexerLatestVersion = 3
 
        // TODO: make this configurable if necessary
        maxTotalHits = 10000
 )
 
+// ErrMalformedResponse is never expected as we initialize the indexer ourselves and so define the types.
+var ErrMalformedResponse = errors.New("meilisearch returned unexpected malformed content")
+
 var _ internal.Indexer = &Indexer{}
 
 // Indexer implements Indexer interface
@@ -47,6 +51,9 @@ func NewIndexer(url, apiKey, indexerName string) *Indexer {
                },
                DisplayedAttributes: []string{
                        "id",
+                       "title",
+                       "content",
+                       "comments",
                },
                FilterableAttributes: []string{
                        "repo_id",
@@ -221,11 +228,9 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
                return nil, err
        }
 
-       hits := make([]internal.Match, 0, len(searchRes.Hits))
-       for _, hit := range searchRes.Hits {
-               hits = append(hits, internal.Match{
-                       ID: int64(hit.(map[string]any)["id"].(float64)),
-               })
+       hits, err := nonFuzzyWorkaround(searchRes, options.Keyword, options.IsFuzzyKeyword)
+       if err != nil {
+               return nil, err
        }
 
        return &internal.SearchResult{
@@ -241,3 +246,77 @@ func parseSortBy(sortBy internal.SortBy) string {
        }
        return field + ":asc"
 }
+
+// nonFuzzyWorkaround is needed as meilisearch does not have an exact search
+// and you can only change "typo tolerance" per index. So we have to post-filter the results
+// https://www.meilisearch.com/docs/learn/configuration/typo_tolerance#configuring-typo-tolerance
+// TODO: remove once https://github.com/orgs/meilisearch/discussions/377 is addressed
+func nonFuzzyWorkaround(searchRes *meilisearch.SearchResponse, keyword string, isFuzzy bool) ([]internal.Match, error) {
+       hits := make([]internal.Match, 0, len(searchRes.Hits))
+       for _, hit := range searchRes.Hits {
+               hit, ok := hit.(map[string]any)
+               if !ok {
+                       return nil, ErrMalformedResponse
+               }
+
+               if !isFuzzy {
+                       keyword = strings.ToLower(keyword)
+
+                       // declare an anonymous func to check if the title, content or at least one comment contains the keyword
+                       found, err := func() (bool, error) {
+                               // check if title match first
+                               title, ok := hit["title"].(string)
+                               if !ok {
+                                       return false, ErrMalformedResponse
+                               } else if strings.Contains(strings.ToLower(title), keyword) {
+                                       return true, nil
+                               }
+
+                               // check if content has a match
+                               content, ok := hit["content"].(string)
+                               if !ok {
+                                       return false, ErrMalformedResponse
+                               } else if strings.Contains(strings.ToLower(content), keyword) {
+                                       return true, nil
+                               }
+
+                               // now check for each comment if one has a match
+                               // so we first try to cast and skip if there are no comments
+                               comments, ok := hit["comments"].([]any)
+                               if !ok {
+                                       return false, ErrMalformedResponse
+                               } else if len(comments) == 0 {
+                                       return false, nil
+                               }
+
+                               // now we iterate over all and report as soon as we detect one match
+                               for i := range comments {
+                                       comment, ok := comments[i].(string)
+                                       if !ok {
+                                               return false, ErrMalformedResponse
+                                       }
+                                       if strings.Contains(strings.ToLower(comment), keyword) {
+                                               return true, nil
+                                       }
+                               }
+
+                               // we got no match
+                               return false, nil
+                       }()
+
+                       if err != nil {
+                               return nil, err
+                       } else if !found {
+                               continue
+                       }
+               }
+               issueID, ok := hit["id"].(float64)
+               if !ok {
+                       return nil, ErrMalformedResponse
+               }
+               hits = append(hits, internal.Match{
+                       ID: int64(issueID),
+               })
+       }
+       return hits, nil
+}
index 3d7237268e1bdf718e0316bfbc01454ff7a06203..ecce704236b992bd19c9d8baef32ed5bb4e8d446 100644 (file)
@@ -10,7 +10,11 @@ import (
        "testing"
        "time"
 
+       "code.gitea.io/gitea/modules/indexer/issues/internal"
        "code.gitea.io/gitea/modules/indexer/issues/internal/tests"
+
+       "github.com/meilisearch/meilisearch-go"
+       "github.com/stretchr/testify/assert"
 )
 
 func TestMeilisearchIndexer(t *testing.T) {
@@ -48,3 +52,44 @@ func TestMeilisearchIndexer(t *testing.T) {
 
        tests.TestIndexer(t, indexer)
 }
+
+func TestNonFuzzyWorkaround(t *testing.T) {
+       // get unexpected return
+       _, err := nonFuzzyWorkaround(&meilisearch.SearchResponse{
+               Hits: []any{"aa", "bb", "cc", "dd"},
+       }, "bowling", false)
+       assert.ErrorIs(t, err, ErrMalformedResponse)
+
+       validResponse := &meilisearch.SearchResponse{
+               Hits: []any{
+                       map[string]any{
+                               "id":       float64(11),
+                               "title":    "a title",
+                               "content":  "issue body with no match",
+                               "comments": []any{"hey whats up?", "I'm currently bowling", "nice"},
+                       },
+                       map[string]any{
+                               "id":       float64(22),
+                               "title":    "Bowling as title",
+                               "content":  "",
+                               "comments": []any{},
+                       },
+                       map[string]any{
+                               "id":       float64(33),
+                               "title":    "Bowl-ing as fuzzy match",
+                               "content":  "",
+                               "comments": []any{},
+                       },
+               },
+       }
+
+       // nonFuzzy
+       hits, err := nonFuzzyWorkaround(validResponse, "bowling", false)
+       assert.NoError(t, err)
+       assert.EqualValues(t, []internal.Match{{ID: 11}, {ID: 22}}, hits)
+
+       // fuzzy
+       hits, err = nonFuzzyWorkaround(validResponse, "bowling", true)
+       assert.NoError(t, err)
+       assert.EqualValues(t, []internal.Match{{ID: 11}, {ID: 22}, {ID: 33}}, hits)
+}
index 2cde8b655ee7b2ac41d9515a552802cb7d281771..a6bc71ac9cde06e6183e23e505f744a2ad3637fe 100644 (file)
@@ -35,7 +35,7 @@ func Code(ctx *context.Context) {
        keyword := ctx.FormTrim("q")
 
        queryType := ctx.FormTrim("t")
-       isMatch := queryType == "match"
+       isFuzzy := queryType != "match"
 
        ctx.Data["Keyword"] = keyword
        ctx.Data["Language"] = language
@@ -77,7 +77,7 @@ func Code(ctx *context.Context) {
        )
 
        if (len(repoIDs) > 0) || isAdmin {
-               total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isMatch)
+               total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isFuzzy)
                if err != nil {
                        if code_indexer.IsAvailable(ctx) {
                                ctx.ServerError("SearchResults", err)
index c53d8fd918a1cf68ecf4bca112bd4087896f5f3e..766dd5726aa8d46fc668c625d9e3ee7532e04880 100644 (file)
@@ -25,7 +25,7 @@ func Search(ctx *context.Context) {
        keyword := ctx.FormTrim("q")
 
        queryType := ctx.FormTrim("t")
-       isMatch := queryType == "match"
+       isFuzzy := queryType != "match"
 
        ctx.Data["Keyword"] = keyword
        ctx.Data["Language"] = language
@@ -43,7 +43,7 @@ func Search(ctx *context.Context) {
        }
 
        total, searchResults, searchResultLanguages, err := code_indexer.PerformSearch(ctx, []int64{ctx.Repo.Repository.ID},
-               language, keyword, page, setting.UI.RepoSearchPagingNum, isMatch)
+               language, keyword, page, setting.UI.RepoSearchPagingNum, isFuzzy)
        if err != nil {
                if code_indexer.IsAvailable(ctx) {
                        ctx.ServerError("SearchResults", err)
index eb711b76ebb0808c65a064508ef7d6040f8268b2..8613d38b65a66e9cff71082af7800ea0ab978952 100644 (file)
@@ -40,7 +40,7 @@ func CodeSearch(ctx *context.Context) {
        keyword := ctx.FormTrim("q")
 
        queryType := ctx.FormTrim("t")
-       isMatch := queryType == "match"
+       isFuzzy := queryType != "match"
 
        ctx.Data["Keyword"] = keyword
        ctx.Data["Language"] = language
@@ -75,7 +75,7 @@ func CodeSearch(ctx *context.Context) {
        )
 
        if len(repoIDs) > 0 {
-               total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isMatch)
+               total, searchResults, searchResultLanguages, err = code_indexer.PerformSearch(ctx, repoIDs, language, keyword, page, setting.UI.RepoSearchPagingNum, isFuzzy)
                if err != nil {
                        if code_indexer.IsAvailable(ctx) {
                                ctx.ServerError("SearchResults", err)