Diffstat (limited to 'modules')
-rw-r--r--  modules/indexer/code/bleve/bleve.go                      |  44
-rw-r--r--  modules/indexer/code/bleve/token/path/path.go            | 101
-rw-r--r--  modules/indexer/code/bleve/token/path/path_test.go       |  76
-rw-r--r--  modules/indexer/code/elasticsearch/elasticsearch.go      |  75
-rw-r--r--  modules/indexer/code/elasticsearch/elasticsearch_test.go |   4
-rw-r--r--  modules/indexer/code/indexer_test.go                     | 184
-rw-r--r--  modules/indexer/code/internal/util.go                    |  18
-rw-r--r--  modules/indexer/internal/bleve/util.go                   |  27
-rw-r--r--  modules/indexer/internal/bleve/util_test.go              |  45
9 files changed, 534 insertions(+), 40 deletions(-)
diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go
index c17f56d3cf..90e5e62bcb 100644
--- a/modules/indexer/code/bleve/bleve.go
+++ b/modules/indexer/code/bleve/bleve.go
@@ -17,6 +17,7 @@ import (
"code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/gitrepo"
+ path_filter "code.gitea.io/gitea/modules/indexer/code/bleve/token/path"
"code.gitea.io/gitea/modules/indexer/code/internal"
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"
@@ -53,6 +54,7 @@ type RepoIndexerData struct {
RepoID int64
CommitID string
Content string
+ Filename string
Language string
UpdatedAt time.Time
}
@@ -64,8 +66,10 @@ func (d *RepoIndexerData) Type() string {
const (
repoIndexerAnalyzer = "repoIndexerAnalyzer"
+ filenameIndexerAnalyzer = "filenameIndexerAnalyzer"
+ filenameIndexerTokenizer = "filenameIndexerTokenizer"
repoIndexerDocType = "repoIndexerDocType"
- repoIndexerLatestVersion = 6
+ repoIndexerLatestVersion = 7
)
// generateBleveIndexMapping generates a bleve index mapping for the repo indexer
@@ -79,6 +83,11 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
textFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("Content", textFieldMapping)
+ fileNamedMapping := bleve.NewTextFieldMapping()
+ fileNamedMapping.IncludeInAll = false
+ fileNamedMapping.Analyzer = filenameIndexerAnalyzer
+ docMapping.AddFieldMappingsAt("Filename", fileNamedMapping)
+
termFieldMapping := bleve.NewTextFieldMapping()
termFieldMapping.IncludeInAll = false
termFieldMapping.Analyzer = analyzer_keyword.Name
@@ -90,6 +99,7 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
mapping := bleve.NewIndexMapping()
+
if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
return nil, err
} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
@@ -100,6 +110,16 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
}); err != nil {
return nil, err
}
+
+ if err := mapping.AddCustomAnalyzer(filenameIndexerAnalyzer, map[string]any{
+ "type": analyzer_custom.Name,
+ "char_filters": []string{},
+ "tokenizer": unicode.Name,
+ "token_filters": []string{unicodeNormalizeName, path_filter.Name, lowercase.Name},
+ }); err != nil {
+ return nil, err
+ }
+
mapping.DefaultAnalyzer = repoIndexerAnalyzer
mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
@@ -174,6 +194,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
return batch.Index(id, &RepoIndexerData{
RepoID: repo.ID,
CommitID: commitSha,
+ Filename: update.Filename,
Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
UpdatedAt: time.Now().UTC(),
@@ -240,14 +261,19 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
keywordQuery query.Query
)
- phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword)
- phraseQuery.FieldVal = "Content"
- phraseQuery.Analyzer = repoIndexerAnalyzer
- keywordQuery = phraseQuery
+ pathQuery := bleve.NewPrefixQuery(strings.ToLower(opts.Keyword))
+ pathQuery.FieldVal = "Filename"
+ pathQuery.SetBoost(10)
+
+ contentQuery := bleve.NewMatchQuery(opts.Keyword)
+ contentQuery.FieldVal = "Content"
+
if opts.IsKeywordFuzzy {
- phraseQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
+ contentQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
}
+ keywordQuery = bleve.NewDisjunctionQuery(contentQuery, pathQuery)
+
if len(opts.RepoIDs) > 0 {
repoQueries := make([]query.Query, 0, len(opts.RepoIDs))
for _, repoID := range opts.RepoIDs {
@@ -277,7 +303,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
from, pageSize := opts.GetSkipTake()
searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
- searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
+ searchRequest.Fields = []string{"Content", "Filename", "RepoID", "Language", "CommitID", "UpdatedAt"}
searchRequest.IncludeLocations = true
if len(opts.Language) == 0 {
@@ -307,6 +333,10 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
endIndex = locationEnd
}
}
+ if len(hit.Locations["Filename"]) > 0 {
+ startIndex, endIndex = internal.FilenameMatchIndexPos(hit.Fields["Content"].(string))
+ }
+
language := hit.Fields["Language"].(string)
var updatedUnix timeutil.TimeStamp
if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
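For illustration, a minimal sketch (not part of the diff) of how the new disjunction behaves: the boosted PrefixQuery on Filename is OR'ed with the MatchQuery on Content, so a file can rank on its name even when the keyword never appears in its body. This uses bleve/v2's in-memory index with the default mapping, so the analysis differs from the custom analyzers registered above:

    package main

    import (
        "fmt"
        "strings"

        "github.com/blevesearch/bleve/v2"
    )

    func main() {
        // Throwaway in-memory index; field names mirror RepoIndexerData.
        idx, _ := bleve.NewMemOnly(bleve.NewIndexMapping())
        _ = idx.Index("62_ham.md", map[string]any{"Filename": "ham.md", "Content": "This is also not cheese"})
        _ = idx.Index("62_potato/ham.md", map[string]any{"Filename": "potato/ham.md", "Content": "This is not cheese"})

        pathQuery := bleve.NewPrefixQuery(strings.ToLower("ham"))
        pathQuery.SetField("Filename")
        pathQuery.SetBoost(10)

        contentQuery := bleve.NewMatchQuery("ham")
        contentQuery.SetField("Content")

        res, _ := idx.Search(bleve.NewSearchRequest(bleve.NewDisjunctionQuery(contentQuery, pathQuery)))
        for _, hit := range res.Hits {
            fmt.Println(hit.ID, hit.Score) // both documents match through the Filename prefix
        }
    }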
diff --git a/modules/indexer/code/bleve/token/path/path.go b/modules/indexer/code/bleve/token/path/path.go
new file mode 100644
index 0000000000..107e0da109
--- /dev/null
+++ b/modules/indexer/code/bleve/token/path/path.go
@@ -0,0 +1,101 @@
+// Copyright 2024 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package path
+
+import (
+ "slices"
+ "strings"
+
+ "github.com/blevesearch/bleve/v2/analysis"
+ "github.com/blevesearch/bleve/v2/registry"
+)
+
+const (
+ Name = "gitea/path"
+)
+
+type TokenFilter struct{}
+
+func NewTokenFilter() *TokenFilter {
+ return &TokenFilter{}
+}
+
+func TokenFilterConstructor(config map[string]any, cache *registry.Cache) (analysis.TokenFilter, error) {
+ return NewTokenFilter(), nil
+}
+
+func (s *TokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
+ if len(input) == 1 {
+ // if there is only one token, we don't need to generate the reversed chain
+ return generatePathTokens(input, false)
+ }
+
+ normal := generatePathTokens(input, false)
+ reversed := generatePathTokens(input, true)
+
+ return append(normal, reversed...)
+}
+
+// Generates path tokens from the input tokens.
+// This mimics the behavior of the path hierarchy tokenizer in ES. It takes the input tokens and combines them, generating a term for each component
+// in the tree (e.g., foo/bar/baz.md will generate foo, foo/bar, and foo/bar/baz.md).
+//
+// If the reverse flag is set, the order of the tokens is reversed (the same input will generate baz.md, baz.md/bar, baz.md/bar/foo). This is useful
+// for efficiently searching filenames without supplying the full path.
+func generatePathTokens(input analysis.TokenStream, reversed bool) analysis.TokenStream {
+ terms := make([]string, 0, len(input))
+ longestTerm := 0
+
+ if reversed {
+ slices.Reverse(input)
+ }
+
+ for i := 0; i < len(input); i++ {
+ var sb strings.Builder
+ sb.WriteString(string(input[0].Term))
+
+ for j := 1; j < i; j++ {
+ sb.WriteString("/")
+ sb.WriteString(string(input[j].Term))
+ }
+
+ term := sb.String()
+
+ if longestTerm < len(term) {
+ longestTerm = len(term)
+ }
+
+ terms = append(terms, term)
+ }
+
+ output := make(analysis.TokenStream, 0, len(terms))
+
+ for _, term := range terms {
+ var start, end int
+
+ if reversed {
+ start = 0
+ end = len(term)
+ } else {
+ start = longestTerm - len(term)
+ end = longestTerm
+ }
+
+ token := analysis.Token{
+ Position: 1,
+ Start: start,
+ End: end,
+ Type: analysis.AlphaNumeric,
+ Term: []byte(term),
+ }
+
+ output = append(output, &token)
+ }
+
+ return output
+}
+
+func init() {
+ registry.RegisterTokenFilter(Name, TokenFilterConstructor)
+}
diff --git a/modules/indexer/code/bleve/token/path/path_test.go b/modules/indexer/code/bleve/token/path/path_test.go
new file mode 100644
index 0000000000..cc52021ef7
--- /dev/null
+++ b/modules/indexer/code/bleve/token/path/path_test.go
@@ -0,0 +1,76 @@
+// Copyright 2024 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package path
+
+import (
+ "fmt"
+ "testing"
+
+ "github.com/blevesearch/bleve/v2/analysis"
+ "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
+ "github.com/stretchr/testify/assert"
+)
+
+type Scenario struct {
+ Input string
+ Tokens []string
+}
+
+func TestTokenFilter(t *testing.T) {
+ scenarios := []struct {
+ Input string
+ Terms []string
+ }{
+ {
+ Input: "Dockerfile",
+ Terms: []string{"Dockerfile"},
+ },
+ {
+ Input: "Dockerfile.rootless",
+ Terms: []string{"Dockerfile.rootless"},
+ },
+ {
+ Input: "a/b/c/Dockerfile.rootless",
+ Terms: []string{"a", "a/b", "a/b/c", "a/b/c/Dockerfile.rootless", "Dockerfile.rootless", "Dockerfile.rootless/c", "Dockerfile.rootless/c/b", "Dockerfile.rootless/c/b/a"},
+ },
+ {
+ Input: "",
+ Terms: []string{},
+ },
+ }
+
+ for _, scenario := range scenarios {
+ t.Run(fmt.Sprintf("ensure terms of '%s'", scenario.Input), func(t *testing.T) {
+ terms := extractTerms(scenario.Input)
+
+ assert.Len(t, terms, len(scenario.Terms))
+
+ for _, term := range terms {
+ assert.Contains(t, scenario.Terms, term)
+ }
+ })
+ }
+}
+
+func extractTerms(input string) []string {
+ tokens := tokenize(input)
+ filteredTokens := filter(tokens)
+ terms := make([]string, 0, len(filteredTokens))
+
+ for _, token := range filteredTokens {
+ terms = append(terms, string(token.Term))
+ }
+
+ return terms
+}
+
+func filter(input analysis.TokenStream) analysis.TokenStream {
+ filter := NewTokenFilter()
+ return filter.Filter(input)
+}
+
+func tokenize(input string) analysis.TokenStream {
+ tokenizer := unicode.NewUnicodeTokenizer()
+ return tokenizer.Tokenize([]byte(input))
+}
diff --git a/modules/indexer/code/elasticsearch/elasticsearch.go b/modules/indexer/code/elasticsearch/elasticsearch.go
index 5c01034450..669a1bafcc 100644
--- a/modules/indexer/code/elasticsearch/elasticsearch.go
+++ b/modules/indexer/code/elasticsearch/elasticsearch.go
@@ -30,7 +30,7 @@ import (
)
const (
- esRepoIndexerLatestVersion = 1
+ esRepoIndexerLatestVersion = 2
// multi-match-types, currently only 2 types are used
// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
esMultiMatchTypeBestFields = "best_fields"
@@ -57,12 +57,50 @@ func NewIndexer(url, indexerName string) *Indexer {
const (
defaultMapping = `{
+ "settings": {
+ "analysis": {
+ "analyzer": {
+ "filename_path_analyzer": {
+ "tokenizer": "path_tokenizer"
+ },
+ "reversed_filename_path_analyzer": {
+ "tokenizer": "reversed_path_tokenizer"
+ }
+ },
+ "tokenizer": {
+ "path_tokenizer": {
+ "type": "path_hierarchy",
+ "delimiter": "/"
+ },
+ "reversed_path_tokenizer": {
+ "type": "path_hierarchy",
+ "delimiter": "/",
+ "reverse": true
+ }
+ }
+ }
+ },
"mappings": {
"properties": {
"repo_id": {
"type": "long",
"index": true
},
+ "filename": {
+ "type": "text",
+ "term_vector": "with_positions_offsets",
+ "index": true,
+ "fields": {
+ "path": {
+ "type": "text",
+ "analyzer": "reversed_filename_path_analyzer"
+ },
+ "path_reversed": {
+ "type": "text",
+ "analyzer": "filename_path_analyzer"
+ }
+ }
+ },
"content": {
"type": "text",
"term_vector": "with_positions_offsets",
@@ -136,6 +174,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
Id(id).
Doc(map[string]any{
"repo_id": repo.ID,
+ "filename": update.Filename,
"content": string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
"commit_id": sha,
"language": analyze.GetCodeLanguage(update.Filename, fileContents),
@@ -231,11 +270,11 @@ func (b *Indexer) doDelete(ctx context.Context, repoID int64) error {
return err
}
-// indexPos find words positions for start and the following end on content. It will
+// contentMatchIndexPos find words positions for start and the following end on content. It will
// return the beginning position of the first start and the ending position of the
// first end following the start string.
// If not found any of the positions, it will return -1, -1.
-func indexPos(content, start, end string) (int, int) {
+func contentMatchIndexPos(content, start, end string) (int, int) {
startIdx := strings.Index(content, start)
if startIdx < 0 {
return -1, -1
@@ -244,22 +283,29 @@ func indexPos(content, start, end string) (int, int) {
if endIdx < 0 {
return -1, -1
}
- return startIdx, startIdx + len(start) + endIdx + len(end)
+ return startIdx, (startIdx + len(start) + endIdx + len(end)) - 9 // subtract len("<em>") + len("</em>") = 9, since the positions refer to the original content
}
func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
hits := make([]*internal.SearchResult, 0, pageSize)
for _, hit := range searchResult.Hits.Hits {
+ repoID, fileName := internal.ParseIndexerID(hit.Id)
+ res := make(map[string]any)
+ if err := json.Unmarshal(hit.Source, &res); err != nil {
+ return 0, nil, nil, err
+ }
+
// FIXME: There is currently no way to get the position of the keyword in the content within the same request.
// So we get it from the content, which may make the query slower. See
// https://discuss.elastic.co/t/fetching-position-of-keyword-in-matched-document/94291
var startIndex, endIndex int
- c, ok := hit.Highlight["content"]
- if ok && len(c) > 0 {
+ if c, ok := hit.Highlight["filename"]; ok && len(c) > 0 {
+ startIndex, endIndex = internal.FilenameMatchIndexPos(res["content"].(string))
+ } else if c, ok := hit.Highlight["content"]; ok && len(c) > 0 {
// FIXME: Since the highlighted content includes <em> and </em> around the keywords,
// we now need to find their positions. But how do we avoid HTML content that itself
// contains <em> and </em> tags? Has Elasticsearch already handled that?
- startIndex, endIndex = indexPos(c[0], "<em>", "</em>")
+ startIndex, endIndex = contentMatchIndexPos(c[0], "<em>", "</em>")
if startIndex == -1 {
panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0]))
}
@@ -267,12 +313,6 @@ func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int)
panic(fmt.Sprintf("2===%#v", hit.Highlight))
}
- repoID, fileName := internal.ParseIndexerID(hit.Id)
- res := make(map[string]any)
- if err := json.Unmarshal(hit.Source, &res); err != nil {
- return 0, nil, nil, err
- }
-
language := res["language"].(string)
hits = append(hits, &internal.SearchResult{
@@ -283,7 +323,7 @@ func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int)
UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)),
Language: language,
StartIndex: startIndex,
- EndIndex: endIndex - 9, // remove the length <em></em> since we give Content the original data
+ EndIndex: endIndex,
Color: enry.GetColor(language),
})
}
@@ -315,7 +355,10 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
searchType = esMultiMatchTypeBestFields
}
- kwQuery := elastic.NewMultiMatchQuery(opts.Keyword, "content").Type(searchType)
+ kwQuery := elastic.NewBoolQuery().Should(
+ elastic.NewMultiMatchQuery(opts.Keyword, "content").Type(searchType),
+ elastic.NewMultiMatchQuery(opts.Keyword, "filename^10").Type(esMultiMatchTypePhrasePrefix),
+ )
query := elastic.NewBoolQuery()
query = query.Must(kwQuery)
if len(opts.RepoIDs) > 0 {
@@ -341,6 +384,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
Highlight(
elastic.NewHighlight().
Field("content").
+ Field("filename").
NumOfFragments(0). // return all highlighted content in the fragments
HighlighterType("fvh"),
).
@@ -373,6 +417,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
Highlight(
elastic.NewHighlight().
Field("content").
+ Field("filename").
NumOfFragments(0). // return all highlighted content in the fragments
HighlighterType("fvh"),
).
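For reference, a sketch of the request body the new bool/should compiles to, assuming olivere/elastic v7 (the client this file uses); the "^10" suffix is the per-field boost syntax:

    package main

    import (
        "encoding/json"
        "fmt"

        "github.com/olivere/elastic/v7"
    )

    func main() {
        q := elastic.NewBoolQuery().Should(
            elastic.NewMultiMatchQuery("avocado", "content").Type("best_fields"),
            elastic.NewMultiMatchQuery("avocado", "filename^10").Type("phrase_prefix"),
        )
        src, _ := q.Source() // the map marshalled into the query body
        body, _ := json.MarshalIndent(src, "", "  ")
        fmt.Println(string(body))
        // Roughly:
        // {"bool": {"should": [
        //   {"multi_match": {"fields": ["content"], "query": "avocado", "type": "best_fields"}},
        //   {"multi_match": {"fields": ["filename^10"], "query": "avocado", "type": "phrase_prefix"}}
        // ]}}
    }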
diff --git a/modules/indexer/code/elasticsearch/elasticsearch_test.go b/modules/indexer/code/elasticsearch/elasticsearch_test.go
index c6ba93e76d..a6d2af92b2 100644
--- a/modules/indexer/code/elasticsearch/elasticsearch_test.go
+++ b/modules/indexer/code/elasticsearch/elasticsearch_test.go
@@ -10,7 +10,7 @@ import (
)
func TestIndexPos(t *testing.T) {
- startIdx, endIdx := indexPos("test index start and end", "start", "end")
+ startIdx, endIdx := contentMatchIndexPos("test index start and end", "start", "end")
assert.EqualValues(t, 11, startIdx)
- assert.EqualValues(t, 24, endIdx)
+ assert.EqualValues(t, 15, endIdx)
}
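The expected end index drops from 24 to 15 because contentMatchIndexPos now subtracts the nine marker bytes itself. A worked check with real highlight markers (a hypothetical extra test for the same file):

    func TestContentMatchIndexPosWithMarkers(t *testing.T) {
        // "<em>key</em>" maps back to "key" at [2,5) in the original "a key b";
        // the 9 bytes of "<em>" + "</em>" are removed inside the helper.
        startIdx, endIdx := contentMatchIndexPos("a <em>key</em> b", "<em>", "</em>")
        assert.EqualValues(t, 2, startIdx)
        assert.EqualValues(t, 5, endIdx)
    }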
diff --git a/modules/indexer/code/indexer_test.go b/modules/indexer/code/indexer_test.go
index 8975c5ce40..5b33528dcd 100644
--- a/modules/indexer/code/indexer_test.go
+++ b/modules/indexer/code/indexer_test.go
@@ -6,6 +6,7 @@ package code
import (
"context"
"os"
+ "slices"
"testing"
"code.gitea.io/gitea/models/db"
@@ -20,53 +21,166 @@ import (
_ "code.gitea.io/gitea/models/activities"
"github.com/stretchr/testify/assert"
+
+ _ "github.com/mattn/go-sqlite3"
)
+type codeSearchResult struct {
+ Filename string
+ Content string
+}
+
func TestMain(m *testing.M) {
unittest.MainTest(m)
}
func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
t.Run(name, func(t *testing.T) {
- var repoID int64 = 1
- err := index(git.DefaultContext, indexer, repoID)
- assert.NoError(t, err)
+ assert.NoError(t, setupRepositoryIndexes(git.DefaultContext, indexer))
+
keywords := []struct {
RepoIDs []int64
Keyword string
- IDs []int64
Langs int
+ Results []codeSearchResult
}{
+ // Search for an exact match on the contents of a file
+ // This scenario yields a single result (the file README.md on the repo '1')
{
RepoIDs: nil,
Keyword: "Description",
- IDs: []int64{repoID},
Langs: 1,
+ Results: []codeSearchResult{
+ {
+ Filename: "README.md",
+ Content: "# repo1\n\nDescription for repo1",
+ },
+ },
},
+ // Search for an exact match on the contents of a file within the repo '2'.
+ // This scenario yields no results
{
RepoIDs: []int64{2},
Keyword: "Description",
- IDs: []int64{},
Langs: 0,
},
+ // Search for an exact match on the contents of a file
+ // This scenario yields a single result (the file README.md on the repo '1')
{
RepoIDs: nil,
Keyword: "repo1",
- IDs: []int64{repoID},
Langs: 1,
+ Results: []codeSearchResult{
+ {
+ Filename: "README.md",
+ Content: "# repo1\n\nDescription for repo1",
+ },
+ },
},
+ // Search for an exact match on the contents of a file within the repo '2'.
+ // This scenario yields no results
{
RepoIDs: []int64{2},
Keyword: "repo1",
- IDs: []int64{},
Langs: 0,
},
+ // Search for a non-existing term.
+ // This scenario yields no results
{
RepoIDs: nil,
Keyword: "non-exist",
- IDs: []int64{},
Langs: 0,
},
+ // Search for an exact match on the contents of a file within the repo '62'.
+ // This scenario yields a single result (the file avocado.md on the repo '62')
+ {
+ RepoIDs: []int64{62},
+ Keyword: "pineaple",
+ Langs: 1,
+ Results: []codeSearchResult{
+ {
+ Filename: "avocado.md",
+ Content: "# repo1\n\npineaple pie of cucumber juice",
+ },
+ },
+ },
+ // Search for an exact match on the filename within the repo '62'.
+ // This scenario yields a single result (the file avocado.md on the repo '62')
+ {
+ RepoIDs: []int64{62},
+ Keyword: "avocado.md",
+ Langs: 1,
+ Results: []codeSearchResult{
+ {
+ Filename: "avocado.md",
+ Content: "# repo1\n\npineaple pie of cucumber juice",
+ },
+ },
+ },
+ // Search for a partial match on the filename within the repo '62'.
+ // This scenario yields a single result (the file avocado.md on the repo '62')
+ {
+ RepoIDs: []int64{62},
+ Keyword: "avo",
+ Langs: 1,
+ Results: []codeSearchResult{
+ {
+ Filename: "avocado.md",
+ Content: "# repo1\n\npineaple pie of cucumber juice",
+ },
+ },
+ },
+ // Search for matches on both the contents and the filenames within the repo '62'.
+ // This scenario yields two results: the first is based on the filename (cucumber.md), while the second is based on the contents
+ {
+ RepoIDs: []int64{62},
+ Keyword: "cucumber",
+ Langs: 1,
+ Results: []codeSearchResult{
+ {
+ Filename: "cucumber.md",
+ Content: "Salad is good for your health",
+ },
+ {
+ Filename: "avocado.md",
+ Content: "# repo1\n\npineaple pie of cucumber juice",
+ },
+ },
+ },
+ // Search for matches on the filenames within the repo '62'.
+ // This scenario yields two results (both are based on filename, the first one is an exact match)
+ {
+ RepoIDs: []int64{62},
+ Keyword: "ham",
+ Langs: 1,
+ Results: []codeSearchResult{
+ {
+ Filename: "ham.md",
+ Content: "This is also not cheese",
+ },
+ {
+ Filename: "potato/ham.md",
+ Content: "This is not cheese",
+ },
+ },
+ },
+ // Search for matches on the contents of files within the repo '62'.
+ // This scenario yields two results (both are based on contents; the first one is an exact match whereas the second is a 'fuzzy' one)
+ {
+ RepoIDs: []int64{62},
+ Keyword: "This is not cheese",
+ Langs: 1,
+ Results: []codeSearchResult{
+ {
+ Filename: "potato/ham.md",
+ Content: "This is not cheese",
+ },
+ {
+ Filename: "ham.md",
+ Content: "This is also not cheese",
+ },
+ },
+ },
}
for _, kw := range keywords {
@@ -81,19 +195,37 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
IsKeywordFuzzy: true,
})
assert.NoError(t, err)
- assert.Len(t, kw.IDs, int(total))
assert.Len(t, langs, kw.Langs)
- ids := make([]int64, 0, len(res))
+ hits := make([]codeSearchResult, 0, len(res))
+
+ if total > 0 {
+ assert.NotEmpty(t, kw.Results, "The given scenario does not provide any expected results")
+ }
+
for _, hit := range res {
- ids = append(ids, hit.RepoID)
- assert.EqualValues(t, "# repo1\n\nDescription for repo1", hit.Content)
+ hits = append(hits, codeSearchResult{
+ Filename: hit.Filename,
+ Content: hit.Content,
+ })
+ }
+
+ lastIndex := -1
+
+ for _, expected := range kw.Results {
+ index := slices.Index(hits, expected)
+ if index == -1 {
+ assert.Failf(t, "Result not found", "Expected %v in %v", expected, hits)
+ } else if lastIndex > index {
+ assert.Failf(t, "Result is out of order", "The order of %v within %v is wrong", expected, hits)
+ } else {
+ lastIndex = index
+ }
}
- assert.EqualValues(t, kw.IDs, ids)
})
}
- assert.NoError(t, indexer.Delete(context.Background(), repoID))
+ assert.NoError(t, tearDownRepositoryIndexes(indexer))
})
}
@@ -136,3 +268,25 @@ func TestESIndexAndSearch(t *testing.T) {
testIndexer("elastic_search", t, indexer)
}
+
+func setupRepositoryIndexes(ctx context.Context, indexer internal.Indexer) error {
+ for _, repoID := range repositoriesToSearch() {
+ if err := index(ctx, indexer, repoID); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func tearDownRepositoryIndexes(indexer internal.Indexer) error {
+ for _, repoID := range repositoriesToSearch() {
+ if err := indexer.Delete(context.Background(), repoID); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func repositoriesToSearch() []int64 {
+ return []int64{1, 62}
+}
diff --git a/modules/indexer/code/internal/util.go b/modules/indexer/code/internal/util.go
index 689c4f4584..5b95783d9f 100644
--- a/modules/indexer/code/internal/util.go
+++ b/modules/indexer/code/internal/util.go
@@ -10,6 +10,10 @@ import (
"code.gitea.io/gitea/modules/log"
)
+const (
+ filenameMatchNumberOfLines = 7 // Copied from GitHub search
+)
+
func FilenameIndexerID(repoID int64, filename string) string {
return internal.Base36(repoID) + "_" + filename
}
@@ -30,3 +34,17 @@ func FilenameOfIndexerID(indexerID string) string {
}
return indexerID[index+1:]
}
+
+// FilenameMatchIndexPos returns the boundaries of the first lines of the given content (capped at filenameMatchNumberOfLines), used to highlight the head of a file when its name matches.
+func FilenameMatchIndexPos(content string) (int, int) {
+ count := 1
+ for i, c := range content {
+ if c == '\n' {
+ count++
+ if count == filenameMatchNumberOfLines {
+ return 0, i
+ }
+ }
+ }
+ return 0, len(content)
+}
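A test-style sketch of the helper (hypothetical, reusing fixture content from indexer_test.go): for files shorter than the cap, the whole content is returned, so a filename match highlights the head of the file:

    func TestFilenameMatchIndexPos(t *testing.T) {
        // 39 bytes over 3 lines, fewer than the cap, so the span covers everything.
        startIdx, endIdx := FilenameMatchIndexPos("# repo1\n\npineaple pie of cucumber juice")
        assert.EqualValues(t, 0, startIdx)
        assert.EqualValues(t, 39, endIdx)
    }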
diff --git a/modules/indexer/internal/bleve/util.go b/modules/indexer/internal/bleve/util.go
index a2265f86e6..b426b39bc2 100644
--- a/modules/indexer/internal/bleve/util.go
+++ b/modules/indexer/internal/bleve/util.go
@@ -11,10 +11,15 @@ import (
"code.gitea.io/gitea/modules/util"
"github.com/blevesearch/bleve/v2"
+ "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/v2/index/upsidedown"
"github.com/ethantkoenig/rupture"
)
+const (
+ maxFuzziness = 2
+)
+
// openIndexer open the index at the specified path, checking for metadata
// updates and bleve version updates. If index needs to be created (or
// re-created), returns (nil, nil)
@@ -48,7 +53,27 @@ func openIndexer(path string, latestVersion int) (bleve.Index, int, error) {
return index, 0, nil
}
+// GuessFuzzinessByKeyword guesses the fuzziness for a keyword. The fuzziness is based on the Levenshtein distance and determines how many chars
+// two strings may differ by while still being considered equivalent.
+// Given a phrase, its shortest word determines its fuzziness. If a phrase uses CJK (eg: `갃갃갃` `啊啊啊`), the fuzziness is zero.
func GuessFuzzinessByKeyword(s string) int {
+ tokenizer := unicode.NewUnicodeTokenizer()
+ tokens := tokenizer.Tokenize([]byte(s))
+
+ if len(tokens) > 0 {
+ fuzziness := maxFuzziness
+
+ for _, token := range tokens {
+ fuzziness = min(fuzziness, guessFuzzinessByKeyword(string(token.Term)))
+ }
+
+ return fuzziness
+ }
+
+ return 0
+}
+
+func guessFuzzinessByKeyword(s string) int {
// according to https://github.com/blevesearch/bleve/issues/1563, the supported max fuzziness is 2
// magic number 4 was chosen to determine the levenshtein distance per each character of a keyword
// BUT, when using CJK (eg: `갃갃갃` `啊啊啊`), it mismatches a lot.
@@ -57,5 +82,5 @@ func GuessFuzzinessByKeyword(s string) int {
return 0
}
}
- return min(2, len(s)/4)
+ return min(maxFuzziness, len(s)/4)
}
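Worked example of the phrase-level guess (a sketch, matching the scenarios in util_test.go below): the shortest token wins, so a hyphenated phrase can disable fuzziness entirely.

    fmt.Println(GuessFuzzinessByKeyword("Geschwindigkeit")) // 2: min(2, 15/4)
    fmt.Println(GuessFuzzinessByKeyword("non-exist"))       // 0: tokens ["non", "exist"] give min(0, 1)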
diff --git a/modules/indexer/internal/bleve/util_test.go b/modules/indexer/internal/bleve/util_test.go
new file mode 100644
index 0000000000..ae0b12c08d
--- /dev/null
+++ b/modules/indexer/internal/bleve/util_test.go
@@ -0,0 +1,45 @@
+// Copyright 2024 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package bleve
+
+import (
+ "fmt"
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+)
+
+func TestBleveGuessFuzzinessByKeyword(t *testing.T) {
+ scenarios := []struct {
+ Input string
+ Fuzziness int // See util.go for the definition of fuzziness in this particular context
+ }{
+ {
+ Input: "",
+ Fuzziness: 0,
+ },
+ {
+ Input: "Avocado",
+ Fuzziness: 1,
+ },
+ {
+ Input: "Geschwindigkeit",
+ Fuzziness: 2,
+ },
+ {
+ Input: "non-exist",
+ Fuzziness: 0,
+ },
+ {
+ Input: "갃갃갃",
+ Fuzziness: 0,
+ },
+ }
+
+ for _, scenario := range scenarios {
+ t.Run(fmt.Sprintf("ensure fuzziness of '%s' is '%d'", scenario.Input, scenario.Fuzziness), func(t *testing.T) {
+ assert.Equal(t, scenario.Fuzziness, GuessFuzzinessByKeyword(scenario.Input))
+ })
+ }
+}