Diffstat (limited to 'modules')
-rw-r--r--  modules/indexer/code/bleve/bleve.go                      |  44
-rw-r--r--  modules/indexer/code/bleve/token/path/path.go            | 101
-rw-r--r--  modules/indexer/code/bleve/token/path/path_test.go       |  76
-rw-r--r--  modules/indexer/code/elasticsearch/elasticsearch.go      |  75
-rw-r--r--  modules/indexer/code/elasticsearch/elasticsearch_test.go |   4
-rw-r--r--  modules/indexer/code/indexer_test.go                     | 184
-rw-r--r--  modules/indexer/code/internal/util.go                    |  18
-rw-r--r--  modules/indexer/internal/bleve/util.go                   |  27
-rw-r--r--  modules/indexer/internal/bleve/util_test.go              |  45
9 files changed, 534 insertions(+), 40 deletions(-)
diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go
index c17f56d3cf..90e5e62bcb 100644
--- a/modules/indexer/code/bleve/bleve.go
+++ b/modules/indexer/code/bleve/bleve.go
@@ -17,6 +17,7 @@ import (
"code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/gitrepo"
+ path_filter "code.gitea.io/gitea/modules/indexer/code/bleve/token/path"
"code.gitea.io/gitea/modules/indexer/code/internal"
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"
@@ -53,6 +54,7 @@ type RepoIndexerData struct {
RepoID int64
CommitID string
Content string
+ Filename string
Language string
UpdatedAt time.Time
}
@@ -64,8 +66,10 @@ func (d *RepoIndexerData) Type() string {
const (
repoIndexerAnalyzer = "repoIndexerAnalyzer"
+ filenameIndexerAnalyzer = "filenameIndexerAnalyzer"
+ filenameIndexerTokenizer = "filenameIndexerTokenizer"
repoIndexerDocType = "repoIndexerDocType"
- repoIndexerLatestVersion = 6
+ repoIndexerLatestVersion = 7
)
// generateBleveIndexMapping generates a bleve index mapping for the repo indexer
@@ -79,6 +83,11 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
textFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("Content", textFieldMapping)
+ fileNamedMapping := bleve.NewTextFieldMapping()
+ fileNamedMapping.IncludeInAll = false
+ fileNamedMapping.Analyzer = filenameIndexerAnalyzer
+ docMapping.AddFieldMappingsAt("Filename", fileNamedMapping)
+
termFieldMapping := bleve.NewTextFieldMapping()
termFieldMapping.IncludeInAll = false
termFieldMapping.Analyzer = analyzer_keyword.Name
@@ -90,6 +99,7 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
mapping := bleve.NewIndexMapping()
+
if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
return nil, err
} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
@@ -100,6 +110,16 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
}); err != nil {
return nil, err
}
+
+ if err := mapping.AddCustomAnalyzer(filenameIndexerAnalyzer, map[string]any{
+ "type": analyzer_custom.Name,
+ "char_filters": []string{},
+ "tokenizer": unicode.Name,
+ "token_filters": []string{unicodeNormalizeName, path_filter.Name, lowercase.Name},
+ }); err != nil {
+ return nil, err
+ }
+
mapping.DefaultAnalyzer = repoIndexerAnalyzer
mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
@@ -174,6 +194,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
return batch.Index(id, &RepoIndexerData{
RepoID: repo.ID,
CommitID: commitSha,
+ Filename: update.Filename,
Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
UpdatedAt: time.Now().UTC(),
@@ -240,14 +261,19 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
keywordQuery query.Query
)
- phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword)
- phraseQuery.FieldVal = "Content"
- phraseQuery.Analyzer = repoIndexerAnalyzer
- keywordQuery = phraseQuery
+ pathQuery := bleve.NewPrefixQuery(strings.ToLower(opts.Keyword))
+ pathQuery.FieldVal = "Filename"
+ pathQuery.SetBoost(10)
+
+ contentQuery := bleve.NewMatchQuery(opts.Keyword)
+ contentQuery.FieldVal = "Content"
+
if opts.IsKeywordFuzzy {
- phraseQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
+ contentQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
}
+ keywordQuery = bleve.NewDisjunctionQuery(contentQuery, pathQuery)
+
if len(opts.RepoIDs) > 0 {
repoQueries := make([]query.Query, 0, len(opts.RepoIDs))
for _, repoID := range opts.RepoIDs {
@@ -277,7 +303,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
from, pageSize := opts.GetSkipTake()
searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
- searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
+ searchRequest.Fields = []string{"Content", "Filename", "RepoID", "Language", "CommitID", "UpdatedAt"}
searchRequest.IncludeLocations = true
if len(opts.Language) == 0 {
@@ -307,6 +333,10 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
endIndex = locationEnd
}
}
+ if len(hit.Locations["Filename"]) > 0 {
+ startIndex, endIndex = internal.FilenameMatchIndexPos(hit.Fields["Content"].(string))
+ }
+
language := hit.Fields["Language"].(string)
var updatedUnix timeutil.TimeStamp
if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
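For illustration, a minimal sketch (not part of the diff) of how the new disjunction behaves: the boosted PrefixQuery on Filename is OR'ed with the MatchQuery on Content, so a file can rank on its name even when the keyword never appears in its body. This uses bleve/v2's in-memory index with the default mapping, so the analysis differs from the custom analyzers registered above:

    package main

    import (
        "fmt"
        "strings"

        "github.com/blevesearch/bleve/v2"
    )

    func main() {
        // Throwaway in-memory index; field names mirror RepoIndexerData.
        idx, _ := bleve.NewMemOnly(bleve.NewIndexMapping())
        _ = idx.Index("62_ham.md", map[string]any{"Filename": "ham.md", "Content": "This is also not cheese"})
        _ = idx.Index("62_potato/ham.md", map[string]any{"Filename": "potato/ham.md", "Content": "This is not cheese"})

        pathQuery := bleve.NewPrefixQuery(strings.ToLower("ham"))
        pathQuery.SetField("Filename")
        pathQuery.SetBoost(10)

        contentQuery := bleve.NewMatchQuery("ham")
        contentQuery.SetField("Content")

        res, _ := idx.Search(bleve.NewSearchRequest(bleve.NewDisjunctionQuery(contentQuery, pathQuery)))
        for _, hit := range res.Hits {
            fmt.Println(hit.ID, hit.Score) // both documents match through the Filename prefix
        }
    }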
diff --git a/modules/indexer/code/bleve/token/path/path.go b/modules/indexer/code/bleve/token/path/path.go
new file mode 100644
index 0000000000..107e0da109
--- /dev/null
+++ b/modules/indexer/code/bleve/token/path/path.go
@@ -0,0 +1,101 @@
+// Copyright 2024 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package path
+
+import (
+ "slices"
+ "strings"
+
+ "github.com/blevesearch/bleve/v2/analysis"
+ "github.com/blevesearch/bleve/v2/registry"
+)
+
+const (
+ Name = "gitea/path"
+)
+
+type TokenFilter struct{}
+
+func NewTokenFilter() *TokenFilter {
+ return &TokenFilter{}
+}
+
+func TokenFilterConstructor(config map[string]any, cache *registry.Cache) (analysis.TokenFilter, error) {
+ return NewTokenFilter(), nil
+}
+
+func (s *TokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
+ if len(input) == 1 {
+ // if there is only one token, we don't need to generate the reversed chain
+ return generatePathTokens(input, false)
+ }
+
+ normal := generatePathTokens(input, false)
+ reversed := generatePathTokens(input, true)
+
+ return append(normal, reversed...)
+}
+
+// Generates path tokens from the input tokens.
+// This mimics the behavior of the path hierarchy tokenizer in ES. It takes the input tokens and combines them, generating a term for each component
+// in the tree (e.g., foo/bar/baz.md will generate foo, foo/bar, and foo/bar/baz.md).
+//
+// If the reverse flag is set, the order of the tokens is reversed (the same input will generate baz.md, baz.md/bar, baz.md/bar/foo). This is useful
+// for efficiently searching filenames without supplying the full path.
+func generatePathTokens(input analysis.TokenStream, reversed bool) analysis.TokenStream {
+ terms := make([]string, 0, len(input))
+ longestTerm := 0
+
+ if reversed {
+ slices.Reverse(input)
+ }
+
+ for i := 0; i < len(input); i++ {
+ var sb strings.Builder
+ sb.WriteString(string(input[0].Term))
+
+ for j := 1; j < i; j++ {
+ sb.WriteString("/")
+ sb.WriteString(string(input[j].Term))
+ }
+
+ term := sb.String()
+
+ if longestTerm < len(term) {
+ longestTerm = len(term)
+ }
+
+ terms = append(terms, term)
+ }
+
+ output := make(analysis.TokenStream, 0, len(terms))
+
+ for _, term := range terms {
+ var start, end int
+
+ if reversed {
+ start = 0
+ end = len(term)
+ } else {
+ start = longestTerm - len(term)
+ end = longestTerm
+ }
+
+ token := analysis.Token{
+ Position: 1,
+ Start: start,
+ End: end,
+ Type: analysis.AlphaNumeric,
+ Term: []byte(term),
+ }
+
+ output = append(output, &token)
+ }
+
+ return output
+}
+
+func init() {
+ registry.RegisterTokenFilter(Name, TokenFilterConstructor)
+}
diff --git a/modules/indexer/code/bleve/token/path/path_test.go b/modules/indexer/code/bleve/token/path/path_test.go
new file mode 100644
index 0000000000..cc52021ef7
--- /dev/null
+++ b/modules/indexer/code/bleve/token/path/path_test.go
@@ -0,0 +1,76 @@
+// Copyright 2024 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package path
+
+import (
+ "fmt"
+ "testing"
+
+ "github.com/blevesearch/bleve/v2/analysis"
+ "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
+ "github.com/stretchr/testify/assert"
+)
+
+type Scenario struct {
+ Input string
+ Tokens []string
+}
+
+func TestTokenFilter(t *testing.T) {
+ scenarios := []struct {
+ Input string
+ Terms []string
+ }{
+ {
+ Input: "Dockerfile",
+ Terms: []string{"Dockerfile"},
+ },
+ {
+ Input: "Dockerfile.rootless",
+ Terms: []string{"Dockerfile.rootless"},
+ },
+ {
+ Input: "a/b/c/Dockerfile.rootless",
+ Terms: []string{"a", "a/b", "a/b/c", "a/b/c/Dockerfile.rootless", "Dockerfile.rootless", "Dockerfile.rootless/c", "Dockerfile.rootless/c/b", "Dockerfile.rootless/c/b/a"},
+ },
+ {
+ Input: "",
+ Terms: []string{},
+ },
+ }
+
+ for _, scenario := range scenarios {
+ t.Run(fmt.Sprintf("ensure terms of '%s'", scenario.Input), func(t *testing.T) {
+ terms := extractTerms(scenario.Input)
+
+ assert.Len(t, terms, len(scenario.Terms))
+
+ for _, term := range terms {
+ assert.Contains(t, scenario.Terms, term)
+ }
+ })
+ }
+}
+
+func extractTerms(input string) []string {
+ tokens := tokenize(input)
+ filteredTokens := filter(tokens)
+ terms := make([]string, 0, len(filteredTokens))
+
+ for _, token := range filteredTokens {
+ terms = append(terms, string(token.Term))
+ }
+
+ return terms
+}
+
+func filter(input analysis.TokenStream) analysis.TokenStream {
+ filter := NewTokenFilter()
+ return filter.Filter(input)
+}
+
+func tokenize(input string) analysis.TokenStream {
+ tokenizer := unicode.NewUnicodeTokenizer()
+ return tokenizer.Tokenize([]byte(input))
+}
diff --git a/modules/indexer/code/elasticsearch/elasticsearch.go b/modules/indexer/code/elasticsearch/elasticsearch.go
index 5c01034450..669a1bafcc 100644
--- a/modules/indexer/code/elasticsearch/elasticsearch.go
+++ b/modules/indexer/code/elasticsearch/elasticsearch.go
@@ -30,7 +30,7 @@ import (
)
const (
- esRepoIndexerLatestVersion = 1
+ esRepoIndexerLatestVersion = 2
// multi-match-types, currently only 2 types are used
// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
esMultiMatchTypeBestFields = "best_fields"
@@ -57,12 +57,50 @@ func NewIndexer(url, indexerName string) *Indexer {
const (
defaultMapping = `{
+ "settings": {
+ "analysis": {
+ "analyzer": {
+ "filename_path_analyzer": {
+ "tokenizer": "path_tokenizer"
+ },
+ "reversed_filename_path_analyzer": {
+ "tokenizer": "reversed_path_tokenizer"
+ }
+ },
+ "tokenizer": {
+ "path_tokenizer": {
+ "type": "path_hierarchy",
+ "delimiter": "/"
+ },
+ "reversed_path_tokenizer": {
+ "type": "path_hierarchy",
+ "delimiter": "/",
+ "reverse": true
+ }
+ }
+ }
+ },
"mappings": {
"properties": {
"repo_id": {
"type": "long",
"index": true
},
+ "filename": {
+ "type": "text",
+ "term_vector": "with_positions_offsets",
+ "index": true,
+ "fields": {
+ "path": {
+ "type": "text",
+ "analyzer": "reversed_filename_path_analyzer"
+ },
+ "path_reversed": {
+ "type": "text",
+ "analyzer": "filename_path_analyzer"
+ }
+ }
+ },
"content": {
"type": "text",
"term_vector": "with_positions_offsets",
@@ -136,6 +174,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
Id(id).
Doc(map[string]any{
"repo_id": repo.ID,
+ "filename": update.Filename,
"content": string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
"commit_id": sha,
"language": analyze.GetCodeLanguage(update.Filename, fileContents),
@@ -231,11 +270,11 @@ func (b *Indexer) doDelete(ctx context.Context, repoID int64) error {
return err
}
-// indexPos find words positions for start and the following end on content. It will
+// contentMatchIndexPos find words positions for start and the following end on content. It will
// return the beginning position of the first start and the ending position of the
// first end following the start string.
// If not found any of the positions, it will return -1, -1.
-func indexPos(content, start, end string) (int, int) {
+func contentMatchIndexPos(content, start, end string) (int, int) {
startIdx := strings.Index(content, start)
if startIdx < 0 {
return -1, -1
@@ -244,22 +283,29 @@ func indexPos(content, start, end string) (int, int) {
if endIdx < 0 {
return -1, -1
}
- return startIdx, startIdx + len(start) + endIdx + len(end)
+ return startIdx, (startIdx + len(start) + endIdx + len(end)) - 9 // subtract len("<em>") + len("</em>") = 9, since the positions refer to the original content
}
func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
hits := make([]*internal.SearchResult, 0, pageSize)
for _, hit := range searchResult.Hits.Hits {
+ repoID, fileName := internal.ParseIndexerID(hit.Id)
+ res := make(map[string]any)
+ if err := json.Unmarshal(hit.Source, &res); err != nil {
+ return 0, nil, nil, err
+ }
+
// FIXME: There is currently no way to get the position of the keyword in the content within the same request.
// So we get it from the content, which may make the query slower. See
// https://discuss.elastic.co/t/fetching-position-of-keyword-in-matched-document/94291
var startIndex, endIndex int
- c, ok := hit.Highlight["content"]
- if ok && len(c) > 0 {
+ if c, ok := hit.Highlight["filename"]; ok && len(c) > 0 {
+ startIndex, endIndex = internal.FilenameMatchIndexPos(res["content"].(string))
+ } else if c, ok := hit.Highlight["content"]; ok && len(c) > 0 {
// FIXME: Since the highlighted content includes <em> and </em> around the keywords,
// we now need to find their positions. But how do we avoid HTML content that itself
// contains <em> and </em> tags? Has Elasticsearch already handled that?
- startIndex, endIndex = indexPos(c[0], "<em>", "</em>")
+ startIndex, endIndex = contentMatchIndexPos(c[0], "<em>", "</em>")
if startIndex == -1 {
panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0]))
}
@@ -267,12 +313,6 @@ func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int)
panic(fmt.Sprintf("2===%#v", hit.Highlight))
}
- repoID, fileName := internal.ParseIndexerID(hit.Id)
- res := make(map[string]any)
- if err := json.Unmarshal(hit.Source, &res); err != nil {
- return 0, nil, nil, err
- }
-
language := res["language"].(string)
hits = append(hits, &internal.SearchResult{
@@ -283,7 +323,7 @@ func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int)
UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)),
Language: language,
StartIndex: startIndex,
- EndIndex: endIndex - 9, // remove the length <em></em> since we give Content the original data
+ EndIndex: endIndex,
Color: enry.GetColor(language),
})
}
@@ -315,7 +355,10 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
searchType = esMultiMatchTypeBestFields
}
- kwQuery := elastic.NewMultiMatchQuery(opts.Keyword, "content").Type(searchType)
+ kwQuery := elastic.NewBoolQuery().Should(
+ elastic.NewMultiMatchQuery(opts.Keyword, "content").Type(searchType),
+ elastic.NewMultiMatchQuery(opts.Keyword, "filename^10").Type(esMultiMatchTypePhrasePrefix),
+ )
query := elastic.NewBoolQuery()
query = query.Must(kwQuery)
if len(opts.RepoIDs) > 0 {
@@ -341,6 +384,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
Highlight(
elastic.NewHighlight().
Field("content").
+ Field("filename").
NumOfFragments(0). // return all highlighted content in the fragments
HighlighterType("fvh"),
).
@@ -373,6 +417,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
Highlight(
elastic.NewHighlight().
Field("content").
+ Field("filename").
NumOfFragments(0). // return all highlighted content in the fragments
HighlighterType("fvh"),
).
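For reference, a sketch of the request body the new bool/should compiles to, assuming olivere/elastic v7 (the client this file uses); the "^10" suffix is the per-field boost syntax:

    package main

    import (
        "encoding/json"
        "fmt"

        "github.com/olivere/elastic/v7"
    )

    func main() {
        q := elastic.NewBoolQuery().Should(
            elastic.NewMultiMatchQuery("avocado", "content").Type("best_fields"),
            elastic.NewMultiMatchQuery("avocado", "filename^10").Type("phrase_prefix"),
        )
        src, _ := q.Source() // the map marshalled into the query body
        body, _ := json.MarshalIndent(src, "", "  ")
        fmt.Println(string(body))
        // Roughly:
        // {"bool": {"should": [
        //   {"multi_match": {"fields": ["content"], "query": "avocado", "type": "best_fields"}},
        //   {"multi_match": {"fields": ["filename^10"], "query": "avocado", "type": "phrase_prefix"}}
        // ]}}
    }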
diff --git a/modules/indexer/code/elasticsearch/elasticsearch_test.go b/modules/indexer/code/elasticsearch/elasticsearch_test.go
index c6ba93e76d..a6d2af92b2 100644
--- a/modules/indexer/code/elasticsearch/elasticsearch_test.go
+++ b/modules/indexer/code/elasticsearch/elasticsearch_test.go
@@ -10,7 +10,7 @@ import (
)
func TestIndexPos(t *testing.T) {
- startIdx, endIdx := indexPos("test index start and end", "start", "end")
+ startIdx, endIdx := contentMatchIndexPos("test index start and end", "start", "end")
assert.EqualValues(t, 11, startIdx)
- assert.EqualValues(t, 24, endIdx)
+ assert.EqualValues(t, 15, endIdx)
}
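The expected end index drops from 24 to 15 because contentMatchIndexPos now subtracts the nine marker bytes itself. A worked check with real highlight markers (a hypothetical extra test for the same file):

    func TestContentMatchIndexPosWithMarkers(t *testing.T) {
        // "<em>key</em>" maps back to "key" at [2,5) in the original "a key b";
        // the 9 bytes of "<em>" + "</em>" are removed inside the helper.
        startIdx, endIdx := contentMatchIndexPos("a <em>key</em> b", "<em>", "</em>")
        assert.EqualValues(t, 2, startIdx)
        assert.EqualValues(t, 5, endIdx)
    }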
diff --git a/modules/indexer/code/indexer_test.go b/modules/indexer/code/indexer_test.go
index 8975c5ce40..5b33528dcd 100644
--- a/modules/indexer/code/indexer_test.go
+++ b/modules/indexer/code/indexer_test.go
@@ -6,6 +6,7 @@ package code
import (
"context"
"os"
+ "slices"
"testing"
"code.gitea.io/gitea/models/db"
@@ -20,53 +21,166 @@ import (
_ "code.gitea.io/gitea/models/activities"
"github.com/stretchr/testify/assert"
+
+ _ "github.com/mattn/go-sqlite3"
)
+type codeSearchResult struct {
+ Filename string
+ Content string
+}
+
func TestMain(m *testing.M) {
unittest.MainTest(m)
}
func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
t.Run(name, func(t *testing.T) {
- var repoID int64 = 1
- err := index(git.DefaultContext, indexer, repoID)
- assert.NoError(t, err)
+ assert.NoError(t, setupRepositoryIndexes(git.DefaultContext, indexer))
+
keywords := []struct {
RepoIDs []int64
Keyword string
- IDs []int64
Langs int
+ Results []codeSearchResult
}{
+ // Search for an exact match on the contents of a file
+ // This scenario yields a single result (the file README.md on the repo '1')
{
RepoIDs: nil,
Keyword: "Description",
- IDs: []int64{repoID},
Langs: 1,
+ Results: []codeSearchResult{
+ {
+ Filename: "README.md",
+ Content: "# repo1\n\nDescription for repo1",
+ },
+ },
},
+ // Search for an exact match on the contents of a file within the repo '2'.
+ // This scenario yields no results
{
RepoIDs: []int64{2},
Keyword: "Description",
- IDs: []int64{},
Langs: 0,
},
+ // Search for an exact match on the contents of a file
+ // This scenario yields a single result (the file README.md on the repo '1')
{
RepoIDs: nil,
Keyword: "repo1",
- IDs: []int64{repoID},
Langs: 1,
+ Results: []codeSearchResult{
+ {
+ Filename: "README.md",
+ Content: "# repo1\n\nDescription for repo1",
+ },
+ },
},
+ // Search for an exact match on the contents of a file within the repo '2'.
+ // This scenario yields no results
{
RepoIDs: []int64{2},
Keyword: "repo1",
- IDs: []int64{},
Langs: 0,
},
+ // Search for a non-existing term.
+ // This scenario yields no results
{
RepoIDs: nil,
Keyword: "non-exist",
- IDs: []int64{},
Langs: 0,
},
+ // Search for an exact match on the contents of a file within the repo '62'.
+ // This scenario yields a single result (the file avocado.md on the repo '62')
+ {
+ RepoIDs: []int64{62},
+ Keyword: "pineaple",
+ Langs: 1,
+ Results: []codeSearchResult{
+ {
+ Filename: "avocado.md",
+ Content: "# repo1\n\npineaple pie of cucumber juice",
+ },
+ },
+ },
+ // Search for an exact match on the filename within the repo '62'.
+ // This scenario yields a single result (the file avocado.md on the repo '62')
+ {
+ RepoIDs: []int64{62},
+ Keyword: "avocado.md",
+ Langs: 1,
+ Results: []codeSearchResult{
+ {
+ Filename: "avocado.md",
+ Content: "# repo1\n\npineaple pie of cucumber juice",
+ },
+ },
+ },
+ // Search for a partial match on the filename within the repo '62'.
+ // This scenario yields a single result (the file avocado.md on the repo '62')
+ {
+ RepoIDs: []int64{62},
+ Keyword: "avo",
+ Langs: 1,
+ Results: []codeSearchResult{
+ {
+ Filename: "avocado.md",
+ Content: "# repo1\n\npineaple pie of cucumber juice",
+ },
+ },
+ },
+ // Search for matches on both the contents and the filenames within the repo '62'.
+ // This scenario yields two results: the first is based on the filename (cucumber.md), while the second is based on the contents
+ {
+ RepoIDs: []int64{62},
+ Keyword: "cucumber",
+ Langs: 1,
+ Results: []codeSearchResult{
+ {
+ Filename: "cucumber.md",
+ Content: "Salad is good for your health",
+ },
+ {
+ Filename: "avocado.md",
+ Content: "# repo1\n\npineaple pie of cucumber juice",
+ },
+ },
+ },
+ // Search for matches on the filenames within the repo '62'.
+ // This scenario yields two results (both are based on filename, the first one is an exact match)
+ {
+ RepoIDs: []int64{62},
+ Keyword: "ham",
+ Langs: 1,
+ Results: []codeSearchResult{
+ {
+ Filename: "ham.md",
+ Content: "This is also not cheese",
+ },
+ {
+ Filename: "potato/ham.md",
+ Content: "This is not cheese",
+ },
+ },
+ },
+ // Search for matches on the contents of files within the repo '62'.
+ // This scenario yields two results (both are based on contents; the first one is an exact match whereas the second is a 'fuzzy' one)
+ {
+ RepoIDs: []int64{62},
+ Keyword: "This is not cheese",
+ Langs: 1,
+ Results: []codeSearchResult{
+ {
+ Filename: "potato/ham.md",
+ Content: "This is not cheese",
+ },
+ {
+ Filename: "ham.md",
+ Content: "This is also not cheese",
+ },
+ },
+ },
}
for _, kw := range keywords {
@@ -81,19 +195,37 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
IsKeywordFuzzy: true,
})
assert.NoError(t, err)
- assert.Len(t, kw.IDs, int(total))
assert.Len(t, langs, kw.Langs)
- ids := make([]int64, 0, len(res))
+ hits := make([]codeSearchResult, 0, len(res))
+
+ if total > 0 {
+ assert.NotEmpty(t, kw.Results, "The given scenario does not provide any expected results")
+ }
+
for _, hit := range res {
- ids = append(ids, hit.RepoID)
- assert.EqualValues(t, "# repo1\n\nDescription for repo1", hit.Content)
+ hits = append(hits, codeSearchResult{
+ Filename: hit.Filename,
+ Content: hit.Content,
+ })
+ }
+
+ lastIndex := -1
+
+ for _, expected := range kw.Results {
+ index := slices.Index(hits, expected)
+ if index == -1 {
+ assert.Failf(t, "Result not found", "Expected %v in %v", expected, hits)
+ } else if lastIndex > index {
+ assert.Failf(t, "Result is out of order", "The order of %v within %v is wrong", expected, hits)
+ } else {
+ lastIndex = index
+ }
}
- assert.EqualValues(t, kw.IDs, ids)
})
}
- assert.NoError(t, indexer.Delete(context.Background(), repoID))
+ assert.NoError(t, tearDownRepositoryIndexes(indexer))
})
}
@@ -136,3 +268,25 @@ func TestESIndexAndSearch(t *testing.T) {
testIndexer("elastic_search", t, indexer)
}
+
+func setupRepositoryIndexes(ctx context.Context, indexer internal.Indexer) error {
+ for _, repoID := range repositoriesToSearch() {
+ if err := index(ctx, indexer, repoID); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func tearDownRepositoryIndexes(indexer internal.Indexer) error {
+ for _, repoID := range repositoriesToSearch() {
+ if err := indexer.Delete(context.Background(), repoID); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func repositoriesToSearch() []int64 {
+ return []int64{1, 62}
+}
diff --git a/modules/indexer/code/internal/util.go b/modules/indexer/code/internal/util.go
index 689c4f4584..5b95783d9f 100644
--- a/modules/indexer/code/internal/util.go
+++ b/modules/indexer/code/internal/util.go
@@ -10,6 +10,10 @@ import (
"code.gitea.io/gitea/modules/log"
)
+const (
+ filenameMatchNumberOfLines = 7 // Copied from GitHub search
+)
+
func FilenameIndexerID(repoID int64, filename string) string {
return internal.Base36(repoID) + "_" + filename
}
@@ -30,3 +34,17 @@ func FilenameOfIndexerID(indexerID string) string {
}
return indexerID[index+1:]
}
+
+// FilenameMatchIndexPos returns the boundaries of the first lines of the given content (capped at filenameMatchNumberOfLines), used to highlight the head of a file when its name matches.
+func FilenameMatchIndexPos(content string) (int, int) {
+ count := 1
+ for i, c := range content {
+ if c == '\n' {
+ count++
+ if count == filenameMatchNumberOfLines {
+ return 0, i
+ }
+ }
+ }
+ return 0, len(content)
+}
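A test-style sketch of the helper (hypothetical, reusing fixture content from indexer_test.go): for files shorter than the cap, the whole content is returned, so a filename match highlights the head of the file:

    func TestFilenameMatchIndexPos(t *testing.T) {
        // 39 bytes over 3 lines, fewer than the cap, so the span covers everything.
        startIdx, endIdx := FilenameMatchIndexPos("# repo1\n\npineaple pie of cucumber juice")
        assert.EqualValues(t, 0, startIdx)
        assert.EqualValues(t, 39, endIdx)
    }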
diff --git a/modules/indexer/internal/bleve/util.go b/modules/indexer/internal/bleve/util.go
index a2265f86e6..b426b39bc2 100644
--- a/modules/indexer/internal/bleve/util.go
+++ b/modules/indexer/internal/bleve/util.go
@@ -11,10 +11,15 @@ import (
"code.gitea.io/gitea/modules/util"
"github.com/blevesearch/bleve/v2"
+ "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/v2/index/upsidedown"
"github.com/ethantkoenig/rupture"
)
+const (
+ maxFuzziness = 2
+)
+
// openIndexer open the index at the specified path, checking for metadata
// updates and bleve version updates. If index needs to be created (or
// re-created), returns (nil, nil)
@@ -48,7 +53,27 @@ func openIndexer(path string, latestVersion int) (bleve.Index, int, error) {
return index, 0, nil
}
+// GuessFuzzinessByKeyword guesses the fuzziness for a keyword. The fuzziness is based on the Levenshtein distance and determines how many chars
+// two strings may differ by while still being considered equivalent.
+// Given a phrase, its shortest word determines its fuzziness. If a phrase uses CJK (eg: `갃갃갃` `啊啊啊`), the fuzziness is zero.
func GuessFuzzinessByKeyword(s string) int {
+ tokenizer := unicode.NewUnicodeTokenizer()
+ tokens := tokenizer.Tokenize([]byte(s))
+
+ if len(tokens) > 0 {
+ fuzziness := maxFuzziness
+
+ for _, token := range tokens {
+ fuzziness = min(fuzziness, guessFuzzinessByKeyword(string(token.Term)))
+ }
+
+ return fuzziness
+ }
+
+ return 0
+}
+
+func guessFuzzinessByKeyword(s string) int {
// according to https://github.com/blevesearch/bleve/issues/1563, the supported max fuzziness is 2
// magic number 4 was chosen to determine the levenshtein distance per each character of a keyword
// BUT, when using CJK (eg: `갃갃갃` `啊啊啊`), it mismatches a lot.
@@ -57,5 +82,5 @@ func GuessFuzzinessByKeyword(s string) int {
return 0
}
}
- return min(2, len(s)/4)
+ return min(maxFuzziness, len(s)/4)
}
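Worked example of the phrase-level guess (a sketch, matching the scenarios in util_test.go below): the shortest token wins, so a hyphenated phrase can disable fuzziness entirely.

    fmt.Println(GuessFuzzinessByKeyword("Geschwindigkeit")) // 2: min(2, 15/4)
    fmt.Println(GuessFuzzinessByKeyword("non-exist"))       // 0: tokens ["non", "exist"] give min(0, 1)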
diff --git a/modules/indexer/internal/bleve/util_test.go b/modules/indexer/internal/bleve/util_test.go
new file mode 100644
index 0000000000..ae0b12c08d
--- /dev/null
+++ b/modules/indexer/internal/bleve/util_test.go
@@ -0,0 +1,45 @@
+// Copyright 2024 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package bleve
+
+import (
+ "fmt"
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+)
+
+func TestBleveGuessFuzzinessByKeyword(t *testing.T) {
+ scenarios := []struct {
+ Input string
+ Fuzziness int // See util.go for the definition of fuzziness in this particular context
+ }{
+ {
+ Input: "",
+ Fuzziness: 0,
+ },
+ {
+ Input: "Avocado",
+ Fuzziness: 1,
+ },
+ {
+ Input: "Geschwindigkeit",
+ Fuzziness: 2,
+ },
+ {
+ Input: "non-exist",
+ Fuzziness: 0,
+ },
+ {
+ Input: "갃갃갃",
+ Fuzziness: 0,
+ },
+ }
+
+ for _, scenario := range scenarios {
+ t.Run(fmt.Sprintf("ensure fuzziness of '%s' is '%d'", scenario.Input, scenario.Fuzziness), func(t *testing.T) {
+ assert.Equal(t, scenario.Fuzziness, GuessFuzzinessByKeyword(scenario.Input))
+ })
+ }
+}