type: 3
config: "{\"IgnoreWhitespaceConflicts\":false,\"AllowMerge\":true,\"AllowRebase\":true,\"AllowRebaseMerge\":true,\"AllowSquash\":true}"
created_unix: 946684810
+
+-
+ id: 108
+ repo_id: 62
+ type: 1
+ config: "{}"
+ created_unix: 946684810
+
+-
+ id: 109
+ repo_id: 62
+ type: 2
+ config: "{\"EnableTimetracker\":true,\"AllowOnlyContributorsToTrackTime\":true}"
+ created_unix: 946684810
+
+-
+ id: 110
+ repo_id: 62
+ type: 3
+ config: "{\"IgnoreWhitespaceConflicts\":false,\"AllowMerge\":true,\"AllowRebase\":true,\"AllowRebaseMerge\":true,\"AllowSquash\":true}"
+ created_unix: 946684810
size: 0
is_fsck_enabled: true
close_issues_via_commit_in_any_branch: false
+
+-
+ id: 62
+ owner_id: 42
+ owner_name: org42
+ lower_name: search-by-path
+ name: search-by-path
+ default_branch: master
+ num_watches: 0
+ num_stars: 0
+ num_forks: 0
+ num_issues: 0
+ num_closed_issues: 0
+ num_pulls: 0
+ num_closed_pulls: 0
+ num_milestones: 0
+ num_closed_milestones: 0
+ num_projects: 0
+ num_closed_projects: 0
+ is_private: false
+ is_empty: false
+ is_archived: false
+ is_mirror: false
+ status: 0
+ is_fork: false
+ fork_id: 0
+ is_template: false
+ template_id: 0
+ size: 0
+ is_fsck_enabled: true
+ close_issues_via_commit_in_any_branch: false
repo_admin_change_team_access: false
theme: ""
keep_activity_private: false
+
+-
+ id: 42
+ lower_name: org42
+ name: org42
+ full_name: Org42
+ email: org42@example.com
+ keep_email_private: false
+ email_notifications_preference: onmention
+ passwd: ZogKvWdyEx:password
+ passwd_hash_algo: dummy
+ must_change_password: false
+ login_source: 0
+ login_name: org42
+ type: 1
+ salt: ZogKvWdyEx
+ max_repo_creation: -1
+ is_active: false
+ is_admin: false
+ is_restricted: false
+ allow_git_hook: false
+ allow_import_local: false
+ allow_create_organization: true
+ prohibit_login: false
+ avatar: avatar42
+ avatar_email: org42@example.com
+ use_custom_avatar: false
+ num_followers: 0
+ num_following: 0
+ num_stars: 0
+ num_repos: 1
+ num_teams: 0
+ num_members: 0
+ visibility: 0
+ repo_admin_change_team_access: false
+ theme: ""
+ keep_activity_private: false
{
name: "AllPublic/PublicRepositoriesOfUserIncludingCollaborative",
opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 15, AllPublic: true, Template: optional.Some(false)},
- count: 33,
+ count: 34,
},
{
name: "AllPublic/PublicAndPrivateRepositoriesOfUserIncludingCollaborative",
opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 15, Private: true, AllPublic: true, AllLimited: true, Template: optional.Some(false)},
- count: 38,
+ count: 39,
},
{
name: "AllPublic/PublicAndPrivateRepositoriesOfUserIncludingCollaborativeByName",
{
name: "AllPublic/PublicRepositoriesOfOrganization",
opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 17, AllPublic: true, Collaborate: optional.Some(false), Template: optional.Some(false)},
- count: 33,
+ count: 34,
},
{
name: "AllTemplates",
testOrgSuccess(&user_model.SearchUserOptions{OrderBy: "id ASC", ListOptions: db.ListOptions{Page: 4, PageSize: 2}},
[]int64{26, 41})
- testOrgSuccess(&user_model.SearchUserOptions{ListOptions: db.ListOptions{Page: 5, PageSize: 2}},
+ testOrgSuccess(&user_model.SearchUserOptions{OrderBy: "id ASC", ListOptions: db.ListOptions{Page: 5, PageSize: 2}},
+ []int64{42})
+
+ testOrgSuccess(&user_model.SearchUserOptions{ListOptions: db.ListOptions{Page: 6, PageSize: 2}},
[]int64{})
// test users
"code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/gitrepo"
+ path_filter "code.gitea.io/gitea/modules/indexer/code/bleve/token/path"
"code.gitea.io/gitea/modules/indexer/code/internal"
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"
RepoID int64
CommitID string
Content string
+ Filename string
Language string
UpdatedAt time.Time
}
const (
repoIndexerAnalyzer = "repoIndexerAnalyzer"
+ filenameIndexerAnalyzer = "filenameIndexerAnalyzer"
+ filenameIndexerTokenizer = "filenameIndexerTokenizer"
repoIndexerDocType = "repoIndexerDocType"
- repoIndexerLatestVersion = 6
+ repoIndexerLatestVersion = 7
)
// generateBleveIndexMapping generates a bleve index mapping for the repo indexer
textFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("Content", textFieldMapping)
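+	// index the file path in a dedicated field so it can be queried (and boosted) independently of the content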
+ fileNamedMapping := bleve.NewTextFieldMapping()
+ fileNamedMapping.IncludeInAll = false
+ fileNamedMapping.Analyzer = filenameIndexerAnalyzer
+ docMapping.AddFieldMappingsAt("Filename", fileNamedMapping)
+
termFieldMapping := bleve.NewTextFieldMapping()
termFieldMapping.IncludeInAll = false
termFieldMapping.Analyzer = analyzer_keyword.Name
docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
mapping := bleve.NewIndexMapping()
+
if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
return nil, err
} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
}); err != nil {
return nil, err
}
+
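+	// the filename analyzer splits the path with the unicode tokenizer, expands it into hierarchical path terms via the
+	// custom path token filter, and lowercases the terms so prefix queries on Filename are case-insensitive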
+ if err := mapping.AddCustomAnalyzer(filenameIndexerAnalyzer, map[string]any{
+ "type": analyzer_custom.Name,
+ "char_filters": []string{},
+ "tokenizer": unicode.Name,
+ "token_filters": []string{unicodeNormalizeName, path_filter.Name, lowercase.Name},
+ }); err != nil {
+ return nil, err
+ }
+
mapping.DefaultAnalyzer = repoIndexerAnalyzer
mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
return batch.Index(id, &RepoIndexerData{
RepoID: repo.ID,
CommitID: commitSha,
+ Filename: update.Filename,
Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
UpdatedAt: time.Now().UTC(),
keywordQuery query.Query
)
- phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword)
- phraseQuery.FieldVal = "Content"
- phraseQuery.Analyzer = repoIndexerAnalyzer
- keywordQuery = phraseQuery
+ pathQuery := bleve.NewPrefixQuery(strings.ToLower(opts.Keyword))
+ pathQuery.FieldVal = "Filename"
+ pathQuery.SetBoost(10)
+
+ contentQuery := bleve.NewMatchQuery(opts.Keyword)
+ contentQuery.FieldVal = "Content"
+
if opts.IsKeywordFuzzy {
- phraseQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
+ contentQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
}
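+	// a document matches if either its path or its content matches; path (prefix) matches are boosted so filename hits rank first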
+ keywordQuery = bleve.NewDisjunctionQuery(contentQuery, pathQuery)
+
if len(opts.RepoIDs) > 0 {
repoQueries := make([]query.Query, 0, len(opts.RepoIDs))
for _, repoID := range opts.RepoIDs {
from, pageSize := opts.GetSkipTake()
searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
- searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
+ searchRequest.Fields = []string{"Content", "Filename", "RepoID", "Language", "CommitID", "UpdatedAt"}
searchRequest.IncludeLocations = true
if len(opts.Language) == 0 {
endIndex = locationEnd
}
}
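+		// a filename match has no keyword location inside the content, so highlight the first lines of the file instead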
+ if len(hit.Locations["Filename"]) > 0 {
+ startIndex, endIndex = internal.FilenameMatchIndexPos(hit.Fields["Content"].(string))
+ }
+
language := hit.Fields["Language"].(string)
var updatedUnix timeutil.TimeStamp
if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
--- /dev/null
+// Copyright 2024 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package path
+
+import (
+ "slices"
+ "strings"
+
+ "github.com/blevesearch/bleve/v2/analysis"
+ "github.com/blevesearch/bleve/v2/registry"
+)
+
+const (
+ Name = "gitea/path"
+)
+
+type TokenFilter struct{}
+
+func NewTokenFilter() *TokenFilter {
+ return &TokenFilter{}
+}
+
+func TokenFilterConstructor(config map[string]any, cache *registry.Cache) (analysis.TokenFilter, error) {
+ return NewTokenFilter(), nil
+}
+
+func (s *TokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
+ if len(input) == 1 {
+		// if there is only one token, we don't need to generate the reversed chain
+ return generatePathTokens(input, false)
+ }
+
+ normal := generatePathTokens(input, false)
+ reversed := generatePathTokens(input, true)
+
+ return append(normal, reversed...)
+}
+
+// Generates path tokens from the input tokens.
+// This mimics the behavior of the path hierarchy tokenizer in ES. It takes the input tokens and combines them, generating a term for each
+// component in the tree (e.g., foo/bar/baz.md generates foo, foo/bar, and foo/bar/baz.md).
+//
+// If the reverse flag is set, the order of the tokens is reversed (the same input generates baz.md, baz.md/bar, baz.md/bar/foo). This is useful
+// to search efficiently for filenames without supplying the full path.
+func generatePathTokens(input analysis.TokenStream, reversed bool) analysis.TokenStream {
+ terms := make([]string, 0, len(input))
+ longestTerm := 0
+
+ if reversed {
+ slices.Reverse(input)
+ }
+
+ for i := 0; i < len(input); i++ {
+ var sb strings.Builder
+ sb.WriteString(string(input[0].Term))
+
+		for j := 1; j <= i; j++ {
+ sb.WriteString("/")
+ sb.WriteString(string(input[j].Term))
+ }
+
+ term := sb.String()
+
+ if longestTerm < len(term) {
+ longestTerm = len(term)
+ }
+
+ terms = append(terms, term)
+ }
+
+ output := make(analysis.TokenStream, 0, len(terms))
+
+ for _, term := range terms {
+ var start, end int
+
+ if reversed {
+ start = 0
+ end = len(term)
+ } else {
+ start = longestTerm - len(term)
+ end = longestTerm
+ }
+
+ token := analysis.Token{
+ Position: 1,
+ Start: start,
+ End: end,
+ Type: analysis.AlphaNumeric,
+ Term: []byte(term),
+ }
+
+ output = append(output, &token)
+ }
+
+ return output
+}
+
+func init() {
+ registry.RegisterTokenFilter(Name, TokenFilterConstructor)
+}
--- /dev/null
+// Copyright 2024 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package path
+
+import (
+ "fmt"
+ "testing"
+
+ "github.com/blevesearch/bleve/v2/analysis"
+ "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
+ "github.com/stretchr/testify/assert"
+)
+
+type Scenario struct {
+ Input string
+ Tokens []string
+}
+
+func TestTokenFilter(t *testing.T) {
+ scenarios := []struct {
+ Input string
+ Terms []string
+ }{
+ {
+ Input: "Dockerfile",
+ Terms: []string{"Dockerfile"},
+ },
+ {
+ Input: "Dockerfile.rootless",
+ Terms: []string{"Dockerfile.rootless"},
+ },
+ {
+ Input: "a/b/c/Dockerfile.rootless",
+ Terms: []string{"a", "a/b", "a/b/c", "a/b/c/Dockerfile.rootless", "Dockerfile.rootless", "Dockerfile.rootless/c", "Dockerfile.rootless/c/b", "Dockerfile.rootless/c/b/a"},
+ },
+ {
+ Input: "",
+ Terms: []string{},
+ },
+ }
+
+ for _, scenario := range scenarios {
+ t.Run(fmt.Sprintf("ensure terms of '%s'", scenario.Input), func(t *testing.T) {
+ terms := extractTerms(scenario.Input)
+
+ assert.Len(t, terms, len(scenario.Terms))
+
+ for _, term := range terms {
+ assert.Contains(t, scenario.Terms, term)
+ }
+ })
+ }
+}
+
+func extractTerms(input string) []string {
+ tokens := tokenize(input)
+ filteredTokens := filter(tokens)
+ terms := make([]string, 0, len(filteredTokens))
+
+ for _, token := range filteredTokens {
+ terms = append(terms, string(token.Term))
+ }
+
+ return terms
+}
+
+func filter(input analysis.TokenStream) analysis.TokenStream {
+ filter := NewTokenFilter()
+ return filter.Filter(input)
+}
+
+func tokenize(input string) analysis.TokenStream {
+ tokenizer := unicode.NewUnicodeTokenizer()
+ return tokenizer.Tokenize([]byte(input))
+}
)
const (
- esRepoIndexerLatestVersion = 1
+ esRepoIndexerLatestVersion = 2
// multi-match-types, currently only 2 types are used
// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
esMultiMatchTypeBestFields = "best_fields"
const (
defaultMapping = `{
+ "settings": {
+ "analysis": {
+ "analyzer": {
+ "filename_path_analyzer": {
+ "tokenizer": "path_tokenizer"
+ },
+ "reversed_filename_path_analyzer": {
+ "tokenizer": "reversed_path_tokenizer"
+ }
+ },
+ "tokenizer": {
+ "path_tokenizer": {
+ "type": "path_hierarchy",
+ "delimiter": "/"
+ },
+ "reversed_path_tokenizer": {
+ "type": "path_hierarchy",
+ "delimiter": "/",
+ "reverse": true
+ }
+ }
+ }
+ },
"mappings": {
"properties": {
"repo_id": {
"type": "long",
"index": true
},
+ "filename": {
+ "type": "text",
+ "term_vector": "with_positions_offsets",
+ "index": true,
+ "fields": {
+ "path": {
+ "type": "text",
+ "analyzer": "reversed_filename_path_analyzer"
+ },
+ "path_reversed": {
+ "type": "text",
+ "analyzer": "filename_path_analyzer"
+ }
+ }
+ },
"content": {
"type": "text",
"term_vector": "with_positions_offsets",
Id(id).
Doc(map[string]any{
"repo_id": repo.ID,
+ "filename": update.Filename,
"content": string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
"commit_id": sha,
"language": analyze.GetCodeLanguage(update.Filename, fileContents),
return err
}
-// indexPos find words positions for start and the following end on content. It will
+// contentMatchIndexPos finds the positions of start and the following end in content. It will
// return the beginning position of the first start and the ending position of the
// first end following the start string.
// If not found any of the positions, it will return -1, -1.
-func indexPos(content, start, end string) (int, int) {
+func contentMatchIndexPos(content, start, end string) (int, int) {
startIdx := strings.Index(content, start)
if startIdx < 0 {
return -1, -1
if endIdx < 0 {
return -1, -1
}
- return startIdx, startIdx + len(start) + endIdx + len(end)
+	return startIdx, (startIdx + len(start) + endIdx + len(end)) - 9 // subtract the length of "<em></em>" (9) since the offsets refer to Content, which holds the original, un-highlighted data
}
func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
hits := make([]*internal.SearchResult, 0, pageSize)
for _, hit := range searchResult.Hits.Hits {
+ repoID, fileName := internal.ParseIndexerID(hit.Id)
+ res := make(map[string]any)
+ if err := json.Unmarshal(hit.Source, &res); err != nil {
+ return 0, nil, nil, err
+ }
+
// FIXME: There is no way to get the position the keyword on the content currently on the same request.
// So we get it from content, this may made the query slower. See
// https://discuss.elastic.co/t/fetching-position-of-keyword-in-matched-document/94291
var startIndex, endIndex int
- c, ok := hit.Highlight["content"]
- if ok && len(c) > 0 {
+ if c, ok := hit.Highlight["filename"]; ok && len(c) > 0 {
+ startIndex, endIndex = internal.FilenameMatchIndexPos(res["content"].(string))
+ } else if c, ok := hit.Highlight["content"]; ok && len(c) > 0 {
// FIXME: Since the highlighting content will include <em> and </em> for the keywords,
// now we should find the positions. But how to avoid html content which contains the
// <em> and </em> tags? If elastic search has handled that?
- startIndex, endIndex = indexPos(c[0], "<em>", "</em>")
+ startIndex, endIndex = contentMatchIndexPos(c[0], "<em>", "</em>")
if startIndex == -1 {
panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0]))
}
panic(fmt.Sprintf("2===%#v", hit.Highlight))
}
- repoID, fileName := internal.ParseIndexerID(hit.Id)
- res := make(map[string]any)
- if err := json.Unmarshal(hit.Source, &res); err != nil {
- return 0, nil, nil, err
- }
-
language := res["language"].(string)
hits = append(hits, &internal.SearchResult{
UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)),
Language: language,
StartIndex: startIndex,
- EndIndex: endIndex - 9, // remove the length <em></em> since we give Content the original data
+ EndIndex: endIndex,
Color: enry.GetColor(language),
})
}
searchType = esMultiMatchTypeBestFields
}
- kwQuery := elastic.NewMultiMatchQuery(opts.Keyword, "content").Type(searchType)
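+	// match either the content or the filename; the filename field is boosted and queried with phrase_prefix so partial paths still match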
+ kwQuery := elastic.NewBoolQuery().Should(
+ elastic.NewMultiMatchQuery(opts.Keyword, "content").Type(searchType),
+ elastic.NewMultiMatchQuery(opts.Keyword, "filename^10").Type(esMultiMatchTypePhrasePrefix),
+ )
query := elastic.NewBoolQuery()
query = query.Must(kwQuery)
if len(opts.RepoIDs) > 0 {
Highlight(
elastic.NewHighlight().
Field("content").
+ Field("filename").
NumOfFragments(0). // return all highting content on fragments
HighlighterType("fvh"),
).
Highlight(
elastic.NewHighlight().
Field("content").
+ Field("filename").
NumOfFragments(0). // return all highting content on fragments
HighlighterType("fvh"),
).
)
func TestIndexPos(t *testing.T) {
- startIdx, endIdx := indexPos("test index start and end", "start", "end")
+ startIdx, endIdx := contentMatchIndexPos("test index start and end", "start", "end")
assert.EqualValues(t, 11, startIdx)
- assert.EqualValues(t, 24, endIdx)
+ assert.EqualValues(t, 15, endIdx)
}
import (
"context"
"os"
+ "slices"
"testing"
"code.gitea.io/gitea/models/db"
_ "code.gitea.io/gitea/models/activities"
"github.com/stretchr/testify/assert"
+
+ _ "github.com/mattn/go-sqlite3"
)
+type codeSearchResult struct {
+ Filename string
+ Content string
+}
+
func TestMain(m *testing.M) {
unittest.MainTest(m)
}
func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
t.Run(name, func(t *testing.T) {
- var repoID int64 = 1
- err := index(git.DefaultContext, indexer, repoID)
- assert.NoError(t, err)
+ assert.NoError(t, setupRepositoryIndexes(git.DefaultContext, indexer))
+
keywords := []struct {
RepoIDs []int64
Keyword string
- IDs []int64
Langs int
+ Results []codeSearchResult
}{
+ // Search for an exact match on the contents of a file
+ // This scenario yields a single result (the file README.md on the repo '1')
{
RepoIDs: nil,
Keyword: "Description",
- IDs: []int64{repoID},
Langs: 1,
+ Results: []codeSearchResult{
+ {
+ Filename: "README.md",
+ Content: "# repo1\n\nDescription for repo1",
+ },
+ },
},
+ // Search for an exact match on the contents of a file within the repo '2'.
+ // This scenario yields no results
{
RepoIDs: []int64{2},
Keyword: "Description",
- IDs: []int64{},
Langs: 0,
},
+ // Search for an exact match on the contents of a file
+ // This scenario yields a single result (the file README.md on the repo '1')
{
RepoIDs: nil,
Keyword: "repo1",
- IDs: []int64{repoID},
Langs: 1,
+ Results: []codeSearchResult{
+ {
+ Filename: "README.md",
+ Content: "# repo1\n\nDescription for repo1",
+ },
+ },
},
+ // Search for an exact match on the contents of a file within the repo '2'.
+ // This scenario yields no results
{
RepoIDs: []int64{2},
Keyword: "repo1",
- IDs: []int64{},
Langs: 0,
},
+ // Search for a non-existing term.
+ // This scenario yields no results
{
RepoIDs: nil,
Keyword: "non-exist",
- IDs: []int64{},
Langs: 0,
},
+ // Search for an exact match on the contents of a file within the repo '62'.
+ // This scenario yields a single result (the file avocado.md on the repo '62')
+ {
+ RepoIDs: []int64{62},
+ Keyword: "pineaple",
+ Langs: 1,
+ Results: []codeSearchResult{
+ {
+ Filename: "avocado.md",
+ Content: "# repo1\n\npineaple pie of cucumber juice",
+ },
+ },
+ },
+ // Search for an exact match on the filename within the repo '62'.
+ // This scenario yields a single result (the file avocado.md on the repo '62')
+ {
+ RepoIDs: []int64{62},
+ Keyword: "avocado.md",
+ Langs: 1,
+ Results: []codeSearchResult{
+ {
+ Filename: "avocado.md",
+ Content: "# repo1\n\npineaple pie of cucumber juice",
+ },
+ },
+ },
+		// Search for a partial match on the filename within the repo '62'.
+ // This scenario yields a single result (the file avocado.md on the repo '62')
+ {
+ RepoIDs: []int64{62},
+ Keyword: "avo",
+ Langs: 1,
+ Results: []codeSearchResult{
+ {
+ Filename: "avocado.md",
+ Content: "# repo1\n\npineaple pie of cucumber juice",
+ },
+ },
+ },
+ // Search for matches on both the contents and the filenames within the repo '62'.
+		// This scenario yields two results: the first is based on the filename (cucumber.md) while the second is based on the contents
+ {
+ RepoIDs: []int64{62},
+ Keyword: "cucumber",
+ Langs: 1,
+ Results: []codeSearchResult{
+ {
+ Filename: "cucumber.md",
+ Content: "Salad is good for your health",
+ },
+ {
+ Filename: "avocado.md",
+ Content: "# repo1\n\npineaple pie of cucumber juice",
+ },
+ },
+ },
+ // Search for matches on the filenames within the repo '62'.
+		// This scenario yields two results (both are based on the filename; the first one is an exact match)
+ {
+ RepoIDs: []int64{62},
+ Keyword: "ham",
+ Langs: 1,
+ Results: []codeSearchResult{
+ {
+ Filename: "ham.md",
+ Content: "This is also not cheese",
+ },
+ {
+ Filename: "potato/ham.md",
+ Content: "This is not cheese",
+ },
+ },
+ },
+ // Search for matches on the contents of files within the repo '62'.
+		// This scenario yields two results (both are based on contents, the first one is an exact match whereas the second is a 'fuzzy' one)
+ {
+ RepoIDs: []int64{62},
+ Keyword: "This is not cheese",
+ Langs: 1,
+ Results: []codeSearchResult{
+ {
+ Filename: "potato/ham.md",
+ Content: "This is not cheese",
+ },
+ {
+ Filename: "ham.md",
+ Content: "This is also not cheese",
+ },
+ },
+ },
}
for _, kw := range keywords {
IsKeywordFuzzy: true,
})
assert.NoError(t, err)
- assert.Len(t, kw.IDs, int(total))
assert.Len(t, langs, kw.Langs)
- ids := make([]int64, 0, len(res))
+ hits := make([]codeSearchResult, 0, len(res))
+
+ if total > 0 {
+ assert.NotEmpty(t, kw.Results, "The given scenario does not provide any expected results")
+ }
+
for _, hit := range res {
- ids = append(ids, hit.RepoID)
- assert.EqualValues(t, "# repo1\n\nDescription for repo1", hit.Content)
+ hits = append(hits, codeSearchResult{
+ Filename: hit.Filename,
+ Content: hit.Content,
+ })
+ }
+
+ lastIndex := -1
+
+ for _, expected := range kw.Results {
+ index := slices.Index(hits, expected)
+ if index == -1 {
+ assert.Failf(t, "Result not found", "Expected %v in %v", expected, hits)
+ } else if lastIndex > index {
+ assert.Failf(t, "Result is out of order", "The order of %v within %v is wrong", expected, hits)
+ } else {
+ lastIndex = index
+ }
}
- assert.EqualValues(t, kw.IDs, ids)
})
}
- assert.NoError(t, indexer.Delete(context.Background(), repoID))
+ assert.NoError(t, tearDownRepositoryIndexes(indexer))
})
}
testIndexer("elastic_search", t, indexer)
}
+
+func setupRepositoryIndexes(ctx context.Context, indexer internal.Indexer) error {
+ for _, repoID := range repositoriesToSearch() {
+ if err := index(ctx, indexer, repoID); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func tearDownRepositoryIndexes(indexer internal.Indexer) error {
+ for _, repoID := range repositoriesToSearch() {
+ if err := indexer.Delete(context.Background(), repoID); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func repositoriesToSearch() []int64 {
+ return []int64{1, 62}
+}
"code.gitea.io/gitea/modules/log"
)
+const (
+	filenameMatchNumberOfLines = 7 // Copied from GitHub search
+)
+
func FilenameIndexerID(repoID int64, filename string) string {
return internal.Base36(repoID) + "_" + filename
}
}
return indexerID[index+1:]
}
+
+// Given the contents of a file, returns the offsets bounding its first lines (limited by filenameMatchNumberOfLines).
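+// For example, a two-line README returns (0, len(content)), since it has fewer than filenameMatchNumberOfLines lines.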
+func FilenameMatchIndexPos(content string) (int, int) {
+ count := 1
+ for i, c := range content {
+ if c == '\n' {
+ count++
+ if count == filenameMatchNumberOfLines {
+ return 0, i
+ }
+ }
+ }
+ return 0, len(content)
+}
"code.gitea.io/gitea/modules/util"
"github.com/blevesearch/bleve/v2"
+ "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/v2/index/upsidedown"
"github.com/ethantkoenig/rupture"
)
+const (
+ maxFuzziness = 2
+)
+
// openIndexer open the index at the specified path, checking for metadata
// updates and bleve version updates. If index needs to be created (or
// re-created), returns (nil, nil)
return index, 0, nil
}
+// GuessFuzzinessByKeyword guesses the fuzziness for a keyword. The fuzziness is based on the Levenshtein distance and
+// determines how many characters may differ between two strings while they are still considered a match.
+// Given a phrase, its shortest word determines the fuzziness. If a phrase uses CJK (e.g. `갃갃갃`, `啊啊啊`), the fuzziness is zero.
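+// For example (see the scenarios in TestBleveGuessFuzzinessByKeyword): "Avocado" yields 1, "Geschwindigkeit" is capped at
+// maxFuzziness (2), and "non-exist" yields 0 because its shortest token ("non") is shorter than four characters.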
func GuessFuzzinessByKeyword(s string) int {
+ tokenizer := unicode.NewUnicodeTokenizer()
+ tokens := tokenizer.Tokenize([]byte(s))
+
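+	// the fuzziness of the whole phrase is the smallest fuzziness among its tokens, so a single short (or CJK) token
+	// forces exact matching for the entire phrase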
+ if len(tokens) > 0 {
+ fuzziness := maxFuzziness
+
+ for _, token := range tokens {
+ fuzziness = min(fuzziness, guessFuzzinessByKeyword(string(token.Term)))
+ }
+
+ return fuzziness
+ }
+
+ return 0
+}
+
+func guessFuzzinessByKeyword(s string) int {
// according to https://github.com/blevesearch/bleve/issues/1563, the supported max fuzziness is 2
// magic number 4 was chosen to determine the levenshtein distance per each character of a keyword
// BUT, when using CJK (eg: `갃갃갃` `啊啊啊`), it mismatches a lot.
return 0
}
}
- return min(2, len(s)/4)
+ return min(maxFuzziness, len(s)/4)
}
--- /dev/null
+// Copyright 2024 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package bleve
+
+import (
+ "fmt"
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+)
+
+func TestBleveGuessFuzzinessByKeyword(t *testing.T) {
+ scenarios := []struct {
+ Input string
+ Fuzziness int // See util.go for the definition of fuzziness in this particular context
+ }{
+ {
+ Input: "",
+ Fuzziness: 0,
+ },
+ {
+ Input: "Avocado",
+ Fuzziness: 1,
+ },
+ {
+ Input: "Geschwindigkeit",
+ Fuzziness: 2,
+ },
+ {
+ Input: "non-exist",
+ Fuzziness: 0,
+ },
+ {
+ Input: "갃갃갃",
+ Fuzziness: 0,
+ },
+ }
+
+ for _, scenario := range scenarios {
+ t.Run(fmt.Sprintf("ensure fuzziness of '%s' is '%d'", scenario.Input, scenario.Fuzziness), func(t *testing.T) {
+ assert.Equal(t, scenario.Fuzziness, GuessFuzzinessByKeyword(scenario.Input))
+ })
+ }
+}
--- /dev/null
+ref: refs/heads/master
--- /dev/null
+[core]
+ repositoryformatversion = 0
+ filemode = true
+ bare = true
--- /dev/null
+This repository will be used to test code search. The snippet below shows its directory structure
+
+.
+├── avocado.md
+├── cucumber.md
+├── ham.md
+└── potato
+ └── ham.md
--- /dev/null
+#!/usr/bin/env bash
+ORI_DIR=`pwd`
+SHELL_FOLDER=$(cd "$(dirname "$0")";pwd)
+cd "$ORI_DIR"
+for i in `ls "$SHELL_FOLDER/post-receive.d"`; do
+ sh "$SHELL_FOLDER/post-receive.d/$i"
+done
\ No newline at end of file
--- /dev/null
+#!/usr/bin/env bash
+"$GITEA_ROOT/gitea" hook --config="$GITEA_ROOT/$GITEA_CONF" post-receive
--- /dev/null
+#!/usr/bin/env bash
+ORI_DIR=`pwd`
+SHELL_FOLDER=$(cd "$(dirname "$0")";pwd)
+cd "$ORI_DIR"
+for i in `ls "$SHELL_FOLDER/pre-receive.d"`; do
+ sh "$SHELL_FOLDER/pre-receive.d/$i"
+done
\ No newline at end of file
--- /dev/null
+#!/usr/bin/env bash
+"$GITEA_ROOT/gitea" hook --config="$GITEA_ROOT/$GITEA_CONF" pre-receive
--- /dev/null
+#!/usr/bin/env bash
+ORI_DIR=`pwd`
+SHELL_FOLDER=$(cd "$(dirname "$0")";pwd)
+cd "$ORI_DIR"
+for i in `ls "$SHELL_FOLDER/proc-receive.d"`; do
+ sh "$SHELL_FOLDER/proc-receive.d/$i"
+done
--- /dev/null
+#!/usr/bin/env bash
+"$GITEA_ROOT/gitea" hook --config="$GITEA_ROOT/$GITEA_CONF" proc-receive
--- /dev/null
+#!/usr/bin/env bash
+ORI_DIR=`pwd`
+SHELL_FOLDER=$(cd "$(dirname "$0")";pwd)
+cd "$ORI_DIR"
+for i in `ls "$SHELL_FOLDER/update.d"`; do
+ sh "$SHELL_FOLDER/update.d/$i" $1 $2 $3
+done
\ No newline at end of file
--- /dev/null
+#!/usr/bin/env bash
+"$GITEA_ROOT/gitea" hook --config="$GITEA_ROOT/$GITEA_CONF" update $1 $2 $3
--- /dev/null
+# git ls-files --others --exclude-from=.git/info/exclude
+# Lines that start with '#' are comments.
+# For a project mostly in C, the following would be a good set of
+# exclude patterns (uncomment them if you want to use them):
+# *.[oa]
+# *~
--- /dev/null
+90c1019714259b24fb81711d4416ac0f18667dfa refs/heads/DefaultBranch
+985f0301dba5e7b34be866819cd15ad3d8f508ee refs/heads/branch2
+65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/develop
+65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/feature/1
+78fb907e3a3309eae4fe8fef030874cebbf1cd5e refs/heads/home-md-img-check
+3731fe53b763859aaf83e703ee731f6b9447ff1e refs/heads/master
+62fb502a7172d4453f0322a2cc85bddffa57f07a refs/heads/pr-to-update
+4649299398e4d39a5c09eb4f534df6f1e1eb87cc refs/heads/sub-home-md-img-check
+3fa2f829675543ecfc16b2891aebe8bf0608a8f4 refs/notes/commits
+4a357436d925b5c974181ff12a994538ddc5a269 refs/pull/2/head
+5f22f7d0d95d614d25a5b68592adb345a4b5c7fd refs/pull/3/head
+62fb502a7172d4453f0322a2cc85bddffa57f07a refs/pull/5/head
+65f1bf27bc3bf70f64657658635e66094edbcb4d refs/tags/v1.1
--- /dev/null
+P pack-393dc29256bc27cb2ec73898507df710be7a3cf5.pack
+
--- /dev/null
+# pack-refs with: peeled fully-peeled sorted
+90c1019714259b24fb81711d4416ac0f18667dfa refs/heads/DefaultBranch
+985f0301dba5e7b34be866819cd15ad3d8f508ee refs/heads/branch2
+65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/develop
+65f1bf27bc3bf70f64657658635e66094edbcb4d refs/heads/feature/1
+78fb907e3a3309eae4fe8fef030874cebbf1cd5e refs/heads/home-md-img-check
+3731fe53b763859aaf83e703ee731f6b9447ff1e refs/heads/master
+62fb502a7172d4453f0322a2cc85bddffa57f07a refs/heads/pr-to-update
+4649299398e4d39a5c09eb4f534df6f1e1eb87cc refs/heads/sub-home-md-img-check
+3fa2f829675543ecfc16b2891aebe8bf0608a8f4 refs/notes/commits
+4a357436d925b5c974181ff12a994538ddc5a269 refs/pull/2/head
+5f22f7d0d95d614d25a5b68592adb345a4b5c7fd refs/pull/3/head
+62fb502a7172d4453f0322a2cc85bddffa57f07a refs/pull/5/head
+65f1bf27bc3bf70f64657658635e66094edbcb4d refs/tags/v1.1
var apiOrgList []*api.Organization
DecodeJSON(t, resp, &apiOrgList)
- assert.Len(t, apiOrgList, 12)
+ assert.Len(t, apiOrgList, 13)
assert.Equal(t, "Limited Org 36", apiOrgList[1].FullName)
assert.Equal(t, "limited", apiOrgList[1].Visibility)
resp = MakeRequest(t, req, http.StatusOK)
DecodeJSON(t, resp, &apiOrgList)
- assert.Len(t, apiOrgList, 8)
+ assert.Len(t, apiOrgList, 9)
assert.Equal(t, "org 17", apiOrgList[0].FullName)
assert.Equal(t, "public", apiOrgList[0].Visibility)
}
}{
{
name: "RepositoriesMax50", requestURL: "/api/v1/repos/search?limit=50&private=false", expectedResults: expectedResults{
- nil: {count: 35},
- user: {count: 35},
- user2: {count: 35},
+ nil: {count: 36},
+ user: {count: 36},
+ user2: {count: 36},
},
},
{