Allow code search by filename (#32210)

This is a large and complex PR, so let me explain in detail its changes. First, I had to create new index mappings for Bleve and ElasticSerach as the current ones do not support search by filename. This requires Gitea to recreate the code search indexes (I do not know if this is a breaking change, but I feel it deserves a heads-up). I've used [this approach](https://www.elastic.co/guide/en/elasticsearch/reference/7.17/analysis-pathhierarchy-tokenizer.html) to model the filename index. It allows us to efficiently search for both the full path and the name of a file. Bleve, however, does not support this out-of-box, so I had to code a brand new [token filter](https://blevesearch.com/docs/Token-Filters/) to generate the search terms. I also did an overhaul in the `indexer_test.go` file. It now asserts the order of the expected results (this is important since matches based on the name of a file are more relevant than those based on its content). I've added new test scenarios that deal with searching by filename. They use a new repo included in the Gitea fixture. The screenshot below depicts how Gitea shows the search results. It shows results based on content in the same way as the current version does. In matches based on the filename, the first seven lines of the file contents are shown (BTW, this is how GitHub does it). ![image](https://github.com/user-attachments/assets/9d938d86-1a8d-4f89-8644-1921a473e858) Resolves #32096 --------- Signed-off-by: Bruno Sofiato <bruno.sofiato@gmail.com>
author: Bruno Sofiato <bruno.sofiato@gmail.com> 2024-10-11 20:35:04 -0300
committer: GitHub <noreply@github.com> 2024-10-11 23:35:04 +0000
commit: 900ac622514081b90e08135cab175d5d1ea1bc9d (patch)
tree: 5b9fe1530a0ef54ae2abd24f9bf7a18b0fe4fa0e /modules
parent: 0fe5e2b08c311f26c3fc0fc71eb6abffb06bc182 (diff)
download: gitea-900ac622514081b90e08135cab175d5d1ea1bc9d.tar.gz
gitea-900ac622514081b90e08135cab175d5d1ea1bc9d.zip
9 files changed, 534 insertions, 40 deletions
diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go
index c17f56d3cf..90e5e62bcb 100644
--- a/modules/indexer/code/bleve/bleve.go
+++ b/modules/indexer/code/bleve/bleve.go
@@ -17,6 +17,7 @@ import (
 	"code.gitea.io/gitea/modules/charset"
 	"code.gitea.io/gitea/modules/git"
 	"code.gitea.io/gitea/modules/gitrepo"
+	path_filter "code.gitea.io/gitea/modules/indexer/code/bleve/token/path"
 	"code.gitea.io/gitea/modules/indexer/code/internal"
 	indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
 	inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"
@@ -53,6 +54,7 @@ type RepoIndexerData struct {
 	RepoID    int64
 	CommitID  string
 	Content   string
+	Filename  string
 	Language  string
 	UpdatedAt time.Time
 }
@@ -64,8 +66,10 @@ func (d *RepoIndexerData) Type() string {
 
 const (
 	repoIndexerAnalyzer      = "repoIndexerAnalyzer"
+	filenameIndexerAnalyzer  = "filenameIndexerAnalyzer"
+	filenameIndexerTokenizer = "filenameIndexerTokenizer"
 	repoIndexerDocType       = "repoIndexerDocType"
-	repoIndexerLatestVersion = 6
+	repoIndexerLatestVersion = 7
 )
 
 // generateBleveIndexMapping generates a bleve index mapping for the repo indexer
@@ -79,6 +83,11 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
 	textFieldMapping.IncludeInAll = false
 	docMapping.AddFieldMappingsAt("Content", textFieldMapping)
 
+	fileNamedMapping := bleve.NewTextFieldMapping()
+	fileNamedMapping.IncludeInAll = false
+	fileNamedMapping.Analyzer = filenameIndexerAnalyzer
+	docMapping.AddFieldMappingsAt("Filename", fileNamedMapping)
+
 	termFieldMapping := bleve.NewTextFieldMapping()
 	termFieldMapping.IncludeInAll = false
 	termFieldMapping.Analyzer = analyzer_keyword.Name
@@ -90,6 +99,7 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
 	docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
 
 	mapping := bleve.NewIndexMapping()
+
 	if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
 		return nil, err
 	} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
@@ -100,6 +110,16 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
 	}); err != nil {
 		return nil, err
 	}
+
+	if err := mapping.AddCustomAnalyzer(filenameIndexerAnalyzer, map[string]any{
+		"type":          analyzer_custom.Name,
+		"char_filters":  []string{},
+		"tokenizer":     unicode.Name,
+		"token_filters": []string{unicodeNormalizeName, path_filter.Name, lowercase.Name},
+	}); err != nil {
+		return nil, err
+	}
+
 	mapping.DefaultAnalyzer = repoIndexerAnalyzer
 	mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
 	mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
@@ -174,6 +194,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
 	return batch.Index(id, &RepoIndexerData{
 		RepoID:    repo.ID,
 		CommitID:  commitSha,
+		Filename:  update.Filename,
 		Content:   string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
 		Language:  analyze.GetCodeLanguage(update.Filename, fileContents),
 		UpdatedAt: time.Now().UTC(),
@@ -240,14 +261,19 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
 		keywordQuery query.Query
 	)
 
-	phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword)
-	phraseQuery.FieldVal = "Content"
-	phraseQuery.Analyzer = repoIndexerAnalyzer
-	keywordQuery = phraseQuery
+	pathQuery := bleve.NewPrefixQuery(strings.ToLower(opts.Keyword))
+	pathQuery.FieldVal = "Filename"
+	pathQuery.SetBoost(10)
+
+	contentQuery := bleve.NewMatchQuery(opts.Keyword)
+	contentQuery.FieldVal = "Content"
+
 	if opts.IsKeywordFuzzy {
-		phraseQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
+		contentQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
 	}
 
+	keywordQuery = bleve.NewDisjunctionQuery(contentQuery, pathQuery)
+
 	if len(opts.RepoIDs) > 0 {
 		repoQueries := make([]query.Query, 0, len(opts.RepoIDs))
 		for _, repoID := range opts.RepoIDs {
@@ -277,7 +303,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
 
 	from, pageSize := opts.GetSkipTake()
 	searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
-	searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
+	searchRequest.Fields = []string{"Content", "Filename", "RepoID", "Language", "CommitID", "UpdatedAt"}
 	searchRequest.IncludeLocations = true
 
 	if len(opts.Language) == 0 {
@@ -307,6 +333,10 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
 				endIndex = locationEnd
 			}
 		}
+		if len(hit.Locations["Filename"]) > 0 {
+			startIndex, endIndex = internal.FilenameMatchIndexPos(hit.Fields["Content"].(string))
+		}
+
 		language := hit.Fields["Language"].(string)
 		var updatedUnix timeutil.TimeStamp
 		if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
diff --git a/modules/indexer/code/bleve/token/path/path.go b/modules/indexer/code/bleve/token/path/path.go
new file mode 100644
index 0000000000..107e0da109
--- /dev/null
+++ b/modules/indexer/code/bleve/token/path/path.go
@@ -0,0 +1,101 @@
+// Copyright 2024 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package path
+
+import (
+	"slices"
+	"strings"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+)
+
+const (
+	Name = "gitea/path"
+)
+
+type TokenFilter struct{}
+
+func NewTokenFilter() *TokenFilter {
+	return &TokenFilter{}
+}
+
+func TokenFilterConstructor(config map[string]any, cache *registry.Cache) (analysis.TokenFilter, error) {
+	return NewTokenFilter(), nil
+}
+
+func (s *TokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
+	if len(input) == 1 {
+		// if there is only one token, we dont need to generate the reversed chain
+		return generatePathTokens(input, false)
+	}
+
+	normal := generatePathTokens(input, false)
+	reversed := generatePathTokens(input, true)
+
+	return append(normal, reversed...)
+}
+
+// Generates path tokens from the input tokens.
+// This mimics the behavior of the path hierarchy tokenizer in ES. It takes the input tokens and combine them, generating a term for each component
+// in tree (e.g., foo/bar/baz.md will generate foo, foo/bar, and foo/bar/baz.md).
+//
+// If the reverse flag is set, the order of the tokens is reversed (the same input will generate baz.md, baz.md/bar, baz.md/bar/foo). This is useful
+// to efficiently search for filenames without supplying the fullpath.
+func generatePathTokens(input analysis.TokenStream, reversed bool) analysis.TokenStream {
+	terms := make([]string, 0, len(input))
+	longestTerm := 0
+
+	if reversed {
+		slices.Reverse(input)
+	}
+
+	for i := 0; i < len(input); i++ {
+		var sb strings.Builder
+		sb.WriteString(string(input[0].Term))
+
+		for j := 1; j < i; j++ {
+			sb.WriteString("/")
+			sb.WriteString(string(input[j].Term))
+		}
+
+		term := sb.String()
+
+		if longestTerm < len(term) {
+			longestTerm = len(term)
+		}
+
+		terms = append(terms, term)
+	}
+
+	output := make(analysis.TokenStream, 0, len(terms))
+
+	for _, term := range terms {
+		var start, end int
+
+		if reversed {
+			start = 0
+			end = len(term)
+		} else {
+			start = longestTerm - len(term)
+			end = longestTerm
+		}
+
+		token := analysis.Token{
+			Position: 1,
+			Start:    start,
+			End:      end,
+			Type:     analysis.AlphaNumeric,
+			Term:     []byte(term),
+		}
+
+		output = append(output, &token)
+	}
+
+	return output
+}
+
+func init() {
+	registry.RegisterTokenFilter(Name, TokenFilterConstructor)
+}
diff --git a/modules/indexer/code/bleve/token/path/path_test.go b/modules/indexer/code/bleve/token/path/path_test.go
new file mode 100644
index 0000000000..cc52021ef7
--- /dev/null
+++ b/modules/indexer/code/bleve/token/path/path_test.go
@@ -0,0 +1,76 @@
+// Copyright 2024 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package path
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
+	"github.com/stretchr/testify/assert"
+)
+
+type Scenario struct {
+	Input  string
+	Tokens []string
+}
+
+func TestTokenFilter(t *testing.T) {
+	scenarios := []struct {
+		Input string
+		Terms []string
+	}{
+		{
+			Input: "Dockerfile",
+			Terms: []string{"Dockerfile"},
+		},
+		{
+			Input: "Dockerfile.rootless",
+			Terms: []string{"Dockerfile.rootless"},
+		},
+		{
+			Input: "a/b/c/Dockerfile.rootless",
+			Terms: []string{"a", "a/b", "a/b/c", "a/b/c/Dockerfile.rootless", "Dockerfile.rootless", "Dockerfile.rootless/c", "Dockerfile.rootless/c/b", "Dockerfile.rootless/c/b/a"},
+		},
+		{
+			Input: "",
+			Terms: []string{},
+		},
+	}
+
+	for _, scenario := range scenarios {
+		t.Run(fmt.Sprintf("ensure terms of '%s'", scenario.Input), func(t *testing.T) {
+			terms := extractTerms(scenario.Input)
+
+			assert.Len(t, terms, len(scenario.Terms))
+
+			for _, term := range terms {
+				assert.Contains(t, scenario.Terms, term)
+			}
+		})
+	}
+}
+
+func extractTerms(input string) []string {
+	tokens := tokenize(input)
+	filteredTokens := filter(tokens)
+	terms := make([]string, 0, len(filteredTokens))
+
+	for _, token := range filteredTokens {
+		terms = append(terms, string(token.Term))
+	}
+
+	return terms
+}
+
+func filter(input analysis.TokenStream) analysis.TokenStream {
+	filter := NewTokenFilter()
+	return filter.Filter(input)
+}
+
+func tokenize(input string) analysis.TokenStream {
+	tokenizer := unicode.NewUnicodeTokenizer()
+	return tokenizer.Tokenize([]byte(input))
+}
diff --git a/modules/indexer/code/elasticsearch/elasticsearch.go b/modules/indexer/code/elasticsearch/elasticsearch.go
index 5c01034450..669a1bafcc 100644
--- a/modules/indexer/code/elasticsearch/elasticsearch.go
+++ b/modules/indexer/code/elasticsearch/elasticsearch.go
@@ -30,7 +30,7 @@ import (
 )
 
 const (
-	esRepoIndexerLatestVersion = 1
+	esRepoIndexerLatestVersion = 2
 	// multi-match-types, currently only 2 types are used
 	// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
 	esMultiMatchTypeBestFields   = "best_fields"
@@ -57,12 +57,50 @@ func NewIndexer(url, indexerName string) *Indexer {
 
 const (
 	defaultMapping = `{
+		"settings": {
+    		"analysis": {
+      			"analyzer": {
+        			"filename_path_analyzer": {
+          				"tokenizer": "path_tokenizer"
+        			},
+        			"reversed_filename_path_analyzer": {
+          				"tokenizer": "reversed_path_tokenizer"
+        			}
+      			},
+				"tokenizer": {
+					"path_tokenizer": {
+						"type": "path_hierarchy",
+						"delimiter": "/"
+					},
+					"reversed_path_tokenizer": {
+						"type": "path_hierarchy",
+						"delimiter": "/",
+						"reverse": true
+					}
+				}
+			}
+  		},
 		"mappings": {
 			"properties": {
 				"repo_id": {
 					"type": "long",
 					"index": true
 				},
+				"filename": {
+					"type": "text",
+					"term_vector": "with_positions_offsets",
+					"index": true,
+					"fields": {
+         		  		"path": {
+            				"type": "text",
+            				"analyzer": "reversed_filename_path_analyzer"
+						},
+          				"path_reversed": {
+            				"type": "text",
+            				"analyzer": "filename_path_analyzer"
+          				}
+        			}
+				},
 				"content": {
 					"type": "text",
 					"term_vector": "with_positions_offsets",
@@ -136,6 +174,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
 			Id(id).
 			Doc(map[string]any{
 				"repo_id":    repo.ID,
+				"filename":   update.Filename,
 				"content":    string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
 				"commit_id":  sha,
 				"language":   analyze.GetCodeLanguage(update.Filename, fileContents),
@@ -231,11 +270,11 @@ func (b *Indexer) doDelete(ctx context.Context, repoID int64) error {
 	return err
 }
 
-// indexPos find words positions for start and the following end on content. It will
+// contentMatchIndexPos find words positions for start and the following end on content. It will
 // return the beginning position of the first start and the ending position of the
 // first end following the start string.
 // If not found any of the positions, it will return -1, -1.
-func indexPos(content, start, end string) (int, int) {
+func contentMatchIndexPos(content, start, end string) (int, int) {
 	startIdx := strings.Index(content, start)
 	if startIdx < 0 {
 		return -1, -1
@@ -244,22 +283,29 @@ func indexPos(content, start, end string) (int, int) {
 	if endIdx < 0 {
 		return -1, -1
 	}
-	return startIdx, startIdx + len(start) + endIdx + len(end)
+	return startIdx, (startIdx + len(start) + endIdx + len(end)) - 9 // remove the length <em></em> since we give Content the original data
 }
 
 func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
 	hits := make([]*internal.SearchResult, 0, pageSize)
 	for _, hit := range searchResult.Hits.Hits {
+		repoID, fileName := internal.ParseIndexerID(hit.Id)
+		res := make(map[string]any)
+		if err := json.Unmarshal(hit.Source, &res); err != nil {
+			return 0, nil, nil, err
+		}
+
 		// FIXME: There is no way to get the position the keyword on the content currently on the same request.
 		// So we get it from content, this may made the query slower. See
 		// https://discuss.elastic.co/t/fetching-position-of-keyword-in-matched-document/94291
 		var startIndex, endIndex int
-		c, ok := hit.Highlight["content"]
-		if ok && len(c) > 0 {
+		if c, ok := hit.Highlight["filename"]; ok && len(c) > 0 {
+			startIndex, endIndex = internal.FilenameMatchIndexPos(res["content"].(string))
+		} else if c, ok := hit.Highlight["content"]; ok && len(c) > 0 {
 			// FIXME: Since the highlighting content will include <em> and </em> for the keywords,
 			// now we should find the positions. But how to avoid html content which contains the
 			// <em> and </em> tags? If elastic search has handled that?
-			startIndex, endIndex = indexPos(c[0], "<em>", "</em>")
+			startIndex, endIndex = contentMatchIndexPos(c[0], "<em>", "</em>")
 			if startIndex == -1 {
 				panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0]))
 			}
@@ -267,12 +313,6 @@ func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int)
 			panic(fmt.Sprintf("2===%#v", hit.Highlight))
 		}
 
-		repoID, fileName := internal.ParseIndexerID(hit.Id)
-		res := make(map[string]any)
-		if err := json.Unmarshal(hit.Source, &res); err != nil {
-			return 0, nil, nil, err
-		}
-
 		language := res["language"].(string)
 
 		hits = append(hits, &internal.SearchResult{
@@ -283,7 +323,7 @@ func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int)
 			UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)),
 			Language:    language,
 			StartIndex:  startIndex,
-			EndIndex:    endIndex - 9, // remove the length <em></em> since we give Content the original data
+			EndIndex:    endIndex,
 			Color:       enry.GetColor(language),
 		})
 	}
@@ -315,7 +355,10 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
 		searchType = esMultiMatchTypeBestFields
 	}
 
-	kwQuery := elastic.NewMultiMatchQuery(opts.Keyword, "content").Type(searchType)
+	kwQuery := elastic.NewBoolQuery().Should(
+		elastic.NewMultiMatchQuery(opts.Keyword, "content").Type(searchType),
+		elastic.NewMultiMatchQuery(opts.Keyword, "filename^10").Type(esMultiMatchTypePhrasePrefix),
+	)
 	query := elastic.NewBoolQuery()
 	query = query.Must(kwQuery)
 	if len(opts.RepoIDs) > 0 {
@@ -341,6 +384,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
 			Highlight(
 				elastic.NewHighlight().
 					Field("content").
+					Field("filename").
 					NumOfFragments(0). // return all highting content on fragments
 					HighlighterType("fvh"),
 			).
@@ -373,6 +417,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
 		Highlight(
 			elastic.NewHighlight().
 				Field("content").
+				Field("filename").
 				NumOfFragments(0). // return all highting content on fragments
 				HighlighterType("fvh"),
 		).
diff --git a/modules/indexer/code/elasticsearch/elasticsearch_test.go b/modules/indexer/code/elasticsearch/elasticsearch_test.go
index c6ba93e76d..a6d2af92b2 100644
--- a/modules/indexer/code/elasticsearch/elasticsearch_test.go
+++ b/modules/indexer/code/elasticsearch/elasticsearch_test.go
@@ -10,7 +10,7 @@ import (
 )
 
 func TestIndexPos(t *testing.T) {
-	startIdx, endIdx := indexPos("test index start and end", "start", "end")
+	startIdx, endIdx := contentMatchIndexPos("test index start and end", "start", "end")
 	assert.EqualValues(t, 11, startIdx)
-	assert.EqualValues(t, 24, endIdx)
+	assert.EqualValues(t, 15, endIdx)
 }
diff --git a/modules/indexer/code/indexer_test.go b/modules/indexer/code/indexer_test.go
index 8975c5ce40..5b33528dcd 100644
--- a/modules/indexer/code/indexer_test.go
+++ b/modules/indexer/code/indexer_test.go
@@ -6,6 +6,7 @@ package code
 import (
 	"context"
 	"os"
+	"slices"
 	"testing"
 
 	"code.gitea.io/gitea/models/db"
@@ -20,53 +21,166 @@ import (
 	_ "code.gitea.io/gitea/models/activities"
 
 	"github.com/stretchr/testify/assert"
+
+	_ "github.com/mattn/go-sqlite3"
 )
 
+type codeSearchResult struct {
+	Filename string
+	Content  string
+}
+
 func TestMain(m *testing.M) {
 	unittest.MainTest(m)
 }
 
 func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
 	t.Run(name, func(t *testing.T) {
-		var repoID int64 = 1
-		err := index(git.DefaultContext, indexer, repoID)
-		assert.NoError(t, err)
+		assert.NoError(t, setupRepositoryIndexes(git.DefaultContext, indexer))
+
 		keywords := []struct {
 			RepoIDs []int64
 			Keyword string
-			IDs     []int64
 			Langs   int
+			Results []codeSearchResult
 		}{
+			// Search for an exact match on the contents of a file
+			// This scenario yields a single result (the file README.md on the repo '1')
 			{
 				RepoIDs: nil,
 				Keyword: "Description",
-				IDs:     []int64{repoID},
 				Langs:   1,
+				Results: []codeSearchResult{
+					{
+						Filename: "README.md",
+						Content:  "# repo1\n\nDescription for repo1",
+					},
+				},
 			},
+			// Search for an exact match on the contents of a file within the repo '2'.
+			// This scenario yields no results
 			{
 				RepoIDs: []int64{2},
 				Keyword: "Description",
-				IDs:     []int64{},
 				Langs:   0,
 			},
+			// Search for an exact match on the contents of a file
+			// This scenario yields a single result (the file README.md on the repo '1')
 			{
 				RepoIDs: nil,
 				Keyword: "repo1",
-				IDs:     []int64{repoID},
 				Langs:   1,
+				Results: []codeSearchResult{
+					{
+						Filename: "README.md",
+						Content:  "# repo1\n\nDescription for repo1",
+					},
+				},
 			},
+			// Search for an exact match on the contents of a file within the repo '2'.
+			// This scenario yields no results
 			{
 				RepoIDs: []int64{2},
 				Keyword: "repo1",
-				IDs:     []int64{},
 				Langs:   0,
 			},
+			// Search for a non-existing term.
+			// This scenario yields no results
 			{
 				RepoIDs: nil,
 				Keyword: "non-exist",
-				IDs:     []int64{},
 				Langs:   0,
 			},
+			// Search for an exact match on the contents of a file within the repo '62'.
+			// This scenario yields a single result (the file avocado.md on the repo '62')
+			{
+				RepoIDs: []int64{62},
+				Keyword: "pineaple",
+				Langs:   1,
+				Results: []codeSearchResult{
+					{
+						Filename: "avocado.md",
+						Content:  "# repo1\n\npineaple pie of cucumber juice",
+					},
+				},
+			},
+			// Search for an exact match on the filename within the repo '62'.
+			// This scenario yields a single result (the file avocado.md on the repo '62')
+			{
+				RepoIDs: []int64{62},
+				Keyword: "avocado.md",
+				Langs:   1,
+				Results: []codeSearchResult{
+					{
+						Filename: "avocado.md",
+						Content:  "# repo1\n\npineaple pie of cucumber juice",
+					},
+				},
+			},
+			// Search for an partial match on the filename within the repo '62'.
+			// This scenario yields a single result (the file avocado.md on the repo '62')
+			{
+				RepoIDs: []int64{62},
+				Keyword: "avo",
+				Langs:   1,
+				Results: []codeSearchResult{
+					{
+						Filename: "avocado.md",
+						Content:  "# repo1\n\npineaple pie of cucumber juice",
+					},
+				},
+			},
+			// Search for matches on both the contents and the filenames within the repo '62'.
+			// This scenario yields two results: the first result is baed on the file (cucumber.md) while the second is based on the contents
+			{
+				RepoIDs: []int64{62},
+				Keyword: "cucumber",
+				Langs:   1,
+				Results: []codeSearchResult{
+					{
+						Filename: "cucumber.md",
+						Content:  "Salad is good for your health",
+					},
+					{
+						Filename: "avocado.md",
+						Content:  "# repo1\n\npineaple pie of cucumber juice",
+					},
+				},
+			},
+			// Search for matches on the filenames within the repo '62'.
+			// This scenario yields two results (both are based on filename, the first one is an exact match)
+			{
+				RepoIDs: []int64{62},
+				Keyword: "ham",
+				Langs:   1,
+				Results: []codeSearchResult{
+					{
+						Filename: "ham.md",
+						Content:  "This is also not cheese",
+					},
+					{
+						Filename: "potato/ham.md",
+						Content:  "This is not cheese",
+					},
+				},
+			},
+			// Search for matches on the contents of files within the repo '62'.
+			// This scenario yields two results (both are based on contents, the first one is an exact match where as the second is a 'fuzzy' one)
+			{
+				RepoIDs: []int64{62},
+				Keyword: "This is not cheese",
+				Langs:   1,
+				Results: []codeSearchResult{
+					{
+						Filename: "potato/ham.md",
+						Content:  "This is not cheese",
+					},
+					{
+						Filename: "ham.md",
+						Content:  "This is also not cheese",
+					},
+				},
+			},
 		}
 
 		for _, kw := range keywords {
@@ -81,19 +195,37 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
 					IsKeywordFuzzy: true,
 				})
 				assert.NoError(t, err)
-				assert.Len(t, kw.IDs, int(total))
 				assert.Len(t, langs, kw.Langs)
 
-				ids := make([]int64, 0, len(res))
+				hits := make([]codeSearchResult, 0, len(res))
+
+				if total > 0 {
+					assert.NotEmpty(t, kw.Results, "The given scenario does not provide any expected results")
+				}
+
 				for _, hit := range res {
-					ids = append(ids, hit.RepoID)
-					assert.EqualValues(t, "# repo1\n\nDescription for repo1", hit.Content)
+					hits = append(hits, codeSearchResult{
+						Filename: hit.Filename,
+						Content:  hit.Content,
+					})
+				}
+
+				lastIndex := -1
+
+				for _, expected := range kw.Results {
+					index := slices.Index(hits, expected)
+					if index == -1 {
+						assert.Failf(t, "Result not found", "Expected %v in %v", expected, hits)
+					} else if lastIndex > index {
+						assert.Failf(t, "Result is out of order", "The order of %v within %v is wrong", expected, hits)
+					} else {
+						lastIndex = index
+					}
 				}
-				assert.EqualValues(t, kw.IDs, ids)
 			})
 		}
 
-		assert.NoError(t, indexer.Delete(context.Background(), repoID))
+		assert.NoError(t, tearDownRepositoryIndexes(indexer))
 	})
 }
 
@@ -136,3 +268,25 @@ func TestESIndexAndSearch(t *testing.T) {
 
 	testIndexer("elastic_search", t, indexer)
 }
+
+func setupRepositoryIndexes(ctx context.Context, indexer internal.Indexer) error {
+	for _, repoID := range repositoriesToSearch() {
+		if err := index(ctx, indexer, repoID); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func tearDownRepositoryIndexes(indexer internal.Indexer) error {
+	for _, repoID := range repositoriesToSearch() {
+		if err := indexer.Delete(context.Background(), repoID); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func repositoriesToSearch() []int64 {
+	return []int64{1, 62}
+}
diff --git a/modules/indexer/code/internal/util.go b/modules/indexer/code/internal/util.go
index 689c4f4584..5b95783d9f 100644
--- a/modules/indexer/code/internal/util.go
+++ b/modules/indexer/code/internal/util.go
@@ -10,6 +10,10 @@ import (
 	"code.gitea.io/gitea/modules/log"
 )
 
+const (
+	filenameMatchNumberOfLines = 7 // Copied from github search
+)
+
 func FilenameIndexerID(repoID int64, filename string) string {
 	return internal.Base36(repoID) + "_" + filename
 }
@@ -30,3 +34,17 @@ func FilenameOfIndexerID(indexerID string) string {
 	}
 	return indexerID[index+1:]
 }
+
+// Given the contents of file, returns the boundaries of its first seven lines.
+func FilenameMatchIndexPos(content string) (int, int) {
+	count := 1
+	for i, c := range content {
+		if c == '\n' {
+			count++
+			if count == filenameMatchNumberOfLines {
+				return 0, i
+			}
+		}
+	}
+	return 0, len(content)
+}
diff --git a/modules/indexer/internal/bleve/util.go b/modules/indexer/internal/bleve/util.go
index a2265f86e6..b426b39bc2 100644
--- a/modules/indexer/internal/bleve/util.go
+++ b/modules/indexer/internal/bleve/util.go
@@ -11,10 +11,15 @@ import (
 	"code.gitea.io/gitea/modules/util"
 
 	"github.com/blevesearch/bleve/v2"
+	"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
 	"github.com/blevesearch/bleve/v2/index/upsidedown"
 	"github.com/ethantkoenig/rupture"
 )
 
+const (
+	maxFuzziness = 2
+)
+
 // openIndexer open the index at the specified path, checking for metadata
 // updates and bleve version updates.  If index needs to be created (or
 // re-created), returns (nil, nil)
@@ -48,7 +53,27 @@ func openIndexer(path string, latestVersion int) (bleve.Index, int, error) {
 	return index, 0, nil
 }
 
+// This method test the GuessFuzzinessByKeyword method. The fuzziness is based on the levenshtein distance and determines how many chars
+// may be different on two string and they still be considered equivalent.
+// Given a phrasse, its shortest word determines its fuzziness. If a phrase uses CJK (eg: `갃갃갃` `啊啊啊`), the fuzziness is zero.
 func GuessFuzzinessByKeyword(s string) int {
+	tokenizer := unicode.NewUnicodeTokenizer()
+	tokens := tokenizer.Tokenize([]byte(s))
+
+	if len(tokens) > 0 {
+		fuzziness := maxFuzziness
+
+		for _, token := range tokens {
+			fuzziness = min(fuzziness, guessFuzzinessByKeyword(string(token.Term)))
+		}
+
+		return fuzziness
+	}
+
+	return 0
+}
+
+func guessFuzzinessByKeyword(s string) int {
 	// according to https://github.com/blevesearch/bleve/issues/1563, the supported max fuzziness is 2
 	// magic number 4 was chosen to determine the levenshtein distance per each character of a keyword
 	// BUT, when using CJK (eg: `갃갃갃` `啊啊啊`), it mismatches a lot.
@@ -57,5 +82,5 @@ func GuessFuzzinessByKeyword(s string) int {
 			return 0
 		}
 	}
-	return min(2, len(s)/4)
+	return min(maxFuzziness, len(s)/4)
 }
diff --git a/modules/indexer/internal/bleve/util_test.go b/modules/indexer/internal/bleve/util_test.go
new file mode 100644
index 0000000000..ae0b12c08d
--- /dev/null
+++ b/modules/indexer/internal/bleve/util_test.go
@@ -0,0 +1,45 @@
+// Copyright 2024 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package bleve
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestBleveGuessFuzzinessByKeyword(t *testing.T) {
+	scenarios := []struct {
+		Input     string
+		Fuzziness int // See util.go for the definition of fuzziness in this particular context
+	}{
+		{
+			Input:     "",
+			Fuzziness: 0,
+		},
+		{
+			Input:     "Avocado",
+			Fuzziness: 1,
+		},
+		{
+			Input:     "Geschwindigkeit",
+			Fuzziness: 2,
+		},
+		{
+			Input:     "non-exist",
+			Fuzziness: 0,
+		},
+		{
+			Input:     "갃갃갃",
+			Fuzziness: 0,
+		},
+	}
+
+	for _, scenario := range scenarios {
+		t.Run(fmt.Sprintf("ensure fuzziness of '%s' is '%d'", scenario.Input, scenario.Fuzziness), func(t *testing.T) {
+			assert.Equal(t, scenario.Fuzziness, GuessFuzzinessByKeyword(scenario.Input))
+		})
+	}
+}
author	Bruno Sofiato <bruno.sofiato@gmail.com>	2024-10-11 20:35:04 -0300
committer	GitHub <noreply@github.com>	2024-10-11 23:35:04 +0000
commit	900ac622514081b90e08135cab175d5d1ea1bc9d (patch)
tree	5b9fe1530a0ef54ae2abd24f9bf7a18b0fe4fa0e /modules
parent	0fe5e2b08c311f26c3fc0fc71eb6abffb06bc182 (diff)
download	gitea-900ac622514081b90e08135cab175d5d1ea1bc9d.tar.gz gitea-900ac622514081b90e08135cab175d5d1ea1bc9d.zip