diff options
author | Lauris BH <lauris@nix.lv> | 2020-02-20 21:53:55 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-02-20 16:53:55 -0300 |
commit | 3c45cf8494fcd29e1a99b0ee6f253808eb607053 (patch) | |
tree | cd0e6347bcd2bfc42c18408169a8757ca1dda920 /modules | |
parent | efbd7ca39bde69ff84d4b309bee99edfe2977521 (diff) | |
download | gitea-3c45cf8494fcd29e1a99b0ee6f253808eb607053.tar.gz gitea-3c45cf8494fcd29e1a99b0ee6f253808eb607053.zip |
Add detected file language to code search (#10256)
Move language detection to separate module to be more reusable
Add option to disable vendored file exclusion from file search
Always show all language stats for search
Diffstat (limited to 'modules')
-rw-r--r-- | modules/analyze/code_langauge.go | 36 | ||||
-rw-r--r-- | modules/git/repo_language_stats.go | 27 | ||||
-rw-r--r-- | modules/indexer/code/bleve.go | 114 | ||||
-rw-r--r-- | modules/indexer/code/bleve_test.go | 9 | ||||
-rw-r--r-- | modules/indexer/code/indexer.go | 24 | ||||
-rw-r--r-- | modules/indexer/code/search.go | 21 | ||||
-rw-r--r-- | modules/indexer/code/wrapped.go | 6 | ||||
-rw-r--r-- | modules/setting/indexer.go | 3 |
8 files changed, 188 insertions, 52 deletions
diff --git a/modules/analyze/code_langauge.go b/modules/analyze/code_langauge.go new file mode 100644 index 0000000000..f7dd3e7cfc --- /dev/null +++ b/modules/analyze/code_langauge.go @@ -0,0 +1,36 @@ +// Copyright 2020 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package analyze + +import ( + "path/filepath" + + "github.com/src-d/enry/v2" +) + +// GetCodeLanguageWithCallback detects code language based on file name and content using callback +func GetCodeLanguageWithCallback(filename string, contentFunc func() ([]byte, error)) string { + if language, ok := enry.GetLanguageByExtension(filename); ok { + return language + } + + if language, ok := enry.GetLanguageByFilename(filename); ok { + return language + } + + content, err := contentFunc() + if err != nil { + return enry.OtherLanguage + } + + return enry.GetLanguage(filepath.Base(filename), content) +} + +// GetCodeLanguage detects code language based on file name and content +func GetCodeLanguage(filename string, content []byte) string { + return GetCodeLanguageWithCallback(filename, func() ([]byte, error) { + return content, nil + }) +} diff --git a/modules/git/repo_language_stats.go b/modules/git/repo_language_stats.go index ffe6dd0848..305fb97795 100644 --- a/modules/git/repo_language_stats.go +++ b/modules/git/repo_language_stats.go @@ -9,7 +9,8 @@ import ( "io" "io/ioutil" "math" - "path/filepath" + + "code.gitea.io/gitea/modules/analyze" "github.com/src-d/enry/v2" "gopkg.in/src-d/go-git.v4" @@ -51,25 +52,15 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e // TODO: Use .gitattributes file for linguist overrides - language, ok := enry.GetLanguageByExtension(f.Name) - if !ok { - if language, ok = enry.GetLanguageByFilename(f.Name); !ok { - content, err := readFile(f, fileSizeLimit) - if err != nil { - return nil - } - - language = enry.GetLanguage(filepath.Base(f.Name), 
content) - if language == enry.OtherLanguage { - return nil - } - } + language := analyze.GetCodeLanguageWithCallback(f.Name, func() ([]byte, error) { + return readFile(f, fileSizeLimit) + }) + if language == enry.OtherLanguage || language == "" { + return nil } - if language != "" { - sizes[language] += f.Size - total += f.Size - } + sizes[language] += f.Size + total += f.Size return nil }) diff --git a/modules/indexer/code/bleve.go b/modules/indexer/code/bleve.go index 6052304f83..39171d17a6 100644 --- a/modules/indexer/code/bleve.go +++ b/modules/indexer/code/bleve.go @@ -9,16 +9,20 @@ import ( "os" "strconv" "strings" + "time" "code.gitea.io/gitea/models" + "code.gitea.io/gitea/modules/analyze" "code.gitea.io/gitea/modules/base" "code.gitea.io/gitea/modules/charset" "code.gitea.io/gitea/modules/git" "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/timeutil" "github.com/blevesearch/bleve" - "github.com/blevesearch/bleve/analysis/analyzer/custom" + analyzer_custom "github.com/blevesearch/bleve/analysis/analyzer/custom" + analyzer_keyword "github.com/blevesearch/bleve/analysis/analyzer/keyword" "github.com/blevesearch/bleve/analysis/token/lowercase" "github.com/blevesearch/bleve/analysis/token/unicodenorm" "github.com/blevesearch/bleve/analysis/tokenizer/unicode" @@ -26,6 +30,7 @@ import ( "github.com/blevesearch/bleve/mapping" "github.com/blevesearch/bleve/search/query" "github.com/ethantkoenig/rupture" + "github.com/src-d/enry/v2" ) const unicodeNormalizeName = "unicodeNormalize" @@ -86,8 +91,11 @@ func openIndexer(path string, latestVersion int) (bleve.Index, error) { // RepoIndexerData data stored in the repo indexer type RepoIndexerData struct { - RepoID int64 - Content string + RepoID int64 + CommitID string + Content string + Language string + UpdatedAt time.Time } // Type returns the document type, for bleve's mapping.Classifier interface. 
@@ -95,7 +103,11 @@ func (d *RepoIndexerData) Type() string { return repoIndexerDocType } -func addUpdate(update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error { +func addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error { + // Ignore vendored files in code search + if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) { + return nil + } stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha). RunInDir(repo.RepoPath()) if err != nil { @@ -118,8 +130,11 @@ func addUpdate(update fileUpdate, repo *models.Repository, batch rupture.Flushin id := filenameIndexerID(repo.ID, update.Filename) return batch.Index(id, &RepoIndexerData{ - RepoID: repo.ID, - Content: string(charset.ToUTF8DropErrors(fileContents)), + RepoID: repo.ID, + CommitID: commitSha, + Content: string(charset.ToUTF8DropErrors(fileContents)), + Language: analyze.GetCodeLanguage(update.Filename, fileContents), + UpdatedAt: time.Now().UTC(), }) } @@ -131,7 +146,7 @@ func addDelete(filename string, repo *models.Repository, batch rupture.FlushingB const ( repoIndexerAnalyzer = "repoIndexerAnalyzer" repoIndexerDocType = "repoIndexerDocType" - repoIndexerLatestVersion = 4 + repoIndexerLatestVersion = 5 ) // createRepoIndexer create a repo indexer if one does not already exist @@ -145,11 +160,21 @@ func createRepoIndexer(path string, latestVersion int) (bleve.Index, error) { textFieldMapping.IncludeInAll = false docMapping.AddFieldMappingsAt("Content", textFieldMapping) + termFieldMapping := bleve.NewTextFieldMapping() + termFieldMapping.IncludeInAll = false + termFieldMapping.Analyzer = analyzer_keyword.Name + docMapping.AddFieldMappingsAt("Language", termFieldMapping) + docMapping.AddFieldMappingsAt("CommitID", termFieldMapping) + + timeFieldMapping := bleve.NewDateTimeFieldMapping() + timeFieldMapping.IncludeInAll = false + docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping) + mapping := 
bleve.NewIndexMapping() if err := addUnicodeNormalizeTokenFilter(mapping); err != nil { return nil, err } else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]interface{}{ - "type": custom.Name, + "type": analyzer_custom.Name, "char_filters": []string{}, "tokenizer": unicode.Name, "token_filters": []string{unicodeNormalizeName, lowercase.Name}, @@ -255,7 +280,7 @@ func (b *BleveIndexer) Index(repoID int64) error { batch := rupture.NewFlushingBatch(b.indexer, maxBatchSize) for _, update := range changes.Updates { - if err := addUpdate(update, repo, batch); err != nil { + if err := addUpdate(sha, update, repo, batch); err != nil { return err } } @@ -289,7 +314,7 @@ func (b *BleveIndexer) Delete(repoID int64) error { // Search searches for files in the specified repo. // Returns the matching file-paths -func (b *BleveIndexer) Search(repoIDs []int64, keyword string, page, pageSize int) (int64, []*SearchResult, error) { +func (b *BleveIndexer) Search(repoIDs []int64, language, keyword string, page, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) { phraseQuery := bleve.NewMatchPhraseQuery(keyword) phraseQuery.FieldVal = "Content" phraseQuery.Analyzer = repoIndexerAnalyzer @@ -309,16 +334,35 @@ func (b *BleveIndexer) Search(repoIDs []int64, keyword string, page, pageSize in indexerQuery = phraseQuery } + // Save for reuse without language filter + facetQuery := indexerQuery + if len(language) > 0 { + languageQuery := bleve.NewMatchQuery(language) + languageQuery.FieldVal = "Language" + languageQuery.Analyzer = analyzer_keyword.Name + + indexerQuery = bleve.NewConjunctionQuery( + indexerQuery, + languageQuery, + ) + } + from := (page - 1) * pageSize searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false) - searchRequest.Fields = []string{"Content", "RepoID"} + searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"} searchRequest.IncludeLocations = true + if 
len(language) == 0 { + searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10)) + } + result, err := b.indexer.Search(searchRequest) if err != nil { - return 0, nil, err + return 0, nil, nil, err } + total := int64(result.Total) + searchResults := make([]*SearchResult, len(result.Hits)) for i, hit := range result.Hits { var startIndex, endIndex int = -1, -1 @@ -333,13 +377,47 @@ func (b *BleveIndexer) Search(repoIDs []int64, keyword string, page, pageSize in endIndex = locationEnd } } + language := hit.Fields["Language"].(string) + var updatedUnix timeutil.TimeStamp + if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil { + updatedUnix = timeutil.TimeStamp(t.Unix()) + } searchResults[i] = &SearchResult{ - RepoID: int64(hit.Fields["RepoID"].(float64)), - StartIndex: startIndex, - EndIndex: endIndex, - Filename: filenameOfIndexerID(hit.ID), - Content: hit.Fields["Content"].(string), + RepoID: int64(hit.Fields["RepoID"].(float64)), + StartIndex: startIndex, + EndIndex: endIndex, + Filename: filenameOfIndexerID(hit.ID), + Content: hit.Fields["Content"].(string), + CommitID: hit.Fields["CommitID"].(string), + UpdatedUnix: updatedUnix, + Language: language, + Color: enry.GetColor(language), + } + } + + searchResultLanguages := make([]*SearchResultLanguages, 0, 10) + if len(language) > 0 { + // Use separate query to go get all language counts + facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false) + facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"} + facetRequest.IncludeLocations = true + facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10)) + + if result, err = b.indexer.Search(facetRequest); err != nil { + return 0, nil, nil, err + } + + } + languagesFacet := result.Facets["languages"] + for _, term := range languagesFacet.Terms { + if len(term.Term) == 0 { + continue } + searchResultLanguages = append(searchResultLanguages, 
&SearchResultLanguages{ + Language: term.Term, + Color: enry.GetColor(term.Term), + Count: term.Count, + }) } - return int64(result.Total), searchResults, nil + return total, searchResults, searchResultLanguages, nil } diff --git a/modules/indexer/code/bleve_test.go b/modules/indexer/code/bleve_test.go index 695dceb259..89cfceea2d 100644 --- a/modules/indexer/code/bleve_test.go +++ b/modules/indexer/code/bleve_test.go @@ -49,27 +49,34 @@ func TestIndexAndSearch(t *testing.T) { keywords = []struct { Keyword string IDs []int64 + Langs int }{ { Keyword: "Description", IDs: []int64{1}, + Langs: 1, }, { Keyword: "repo1", IDs: []int64{1}, + Langs: 1, }, { Keyword: "non-exist", IDs: []int64{}, + Langs: 0, }, } ) for _, kw := range keywords { - total, res, err := idx.Search(nil, kw.Keyword, 1, 10) + total, res, langs, err := idx.Search(nil, "", kw.Keyword, 1, 10) assert.NoError(t, err) assert.EqualValues(t, len(kw.IDs), total) + assert.NotNil(t, langs) + assert.Len(t, langs, kw.Langs) + var ids = make([]int64, 0, len(res)) for _, hit := range res { ids = append(ids, hit.RepoID) diff --git a/modules/indexer/code/indexer.go b/modules/indexer/code/indexer.go index 3f9461cd0e..6cbda1491b 100644 --- a/modules/indexer/code/indexer.go +++ b/modules/indexer/code/indexer.go @@ -12,22 +12,34 @@ import ( "code.gitea.io/gitea/modules/graceful" "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/timeutil" ) // SearchResult result of performing a search in a repo type SearchResult struct { - RepoID int64 - StartIndex int - EndIndex int - Filename string - Content string + RepoID int64 + StartIndex int + EndIndex int + Filename string + Content string + CommitID string + UpdatedUnix timeutil.TimeStamp + Language string + Color string +} + +// SearchResultLanguages result of top languages count in search results +type SearchResultLanguages struct { + Language string + Color string + Count int } // Indexer defines an interface to indexer 
issues contents type Indexer interface { Index(repoID int64) error Delete(repoID int64) error - Search(repoIDs []int64, keyword string, page, pageSize int) (int64, []*SearchResult, error) + Search(repoIDs []int64, language, keyword string, page, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) Close() } diff --git a/modules/indexer/code/search.go b/modules/indexer/code/search.go index 18f193a532..ca57b3ff88 100644 --- a/modules/indexer/code/search.go +++ b/modules/indexer/code/search.go @@ -11,6 +11,7 @@ import ( "strings" "code.gitea.io/gitea/modules/highlight" + "code.gitea.io/gitea/modules/timeutil" "code.gitea.io/gitea/modules/util" ) @@ -18,6 +19,10 @@ import ( type Result struct { RepoID int64 Filename string + CommitID string + UpdatedUnix timeutil.TimeStamp + Language string + Color string HighlightClass string LineNumbers []int FormattedLines gotemplate.HTML @@ -100,6 +105,10 @@ func searchResult(result *SearchResult, startIndex, endIndex int) (*Result, erro return &Result{ RepoID: result.RepoID, Filename: result.Filename, + CommitID: result.CommitID, + UpdatedUnix: result.UpdatedUnix, + Language: result.Language, + Color: result.Color, HighlightClass: highlight.FileNameToHighlightClass(result.Filename), LineNumbers: lineNumbers, FormattedLines: gotemplate.HTML(formattedLinesBuffer.String()), @@ -107,14 +116,14 @@ func searchResult(result *SearchResult, startIndex, endIndex int) (*Result, erro } // PerformSearch perform a search on a repository -func PerformSearch(repoIDs []int64, keyword string, page, pageSize int) (int, []*Result, error) { +func PerformSearch(repoIDs []int64, language, keyword string, page, pageSize int) (int, []*Result, []*SearchResultLanguages, error) { if len(keyword) == 0 { - return 0, nil, nil + return 0, nil, nil, nil } - total, results, err := indexer.Search(repoIDs, keyword, page, pageSize) + total, results, resultLanguages, err := indexer.Search(repoIDs, language, keyword, page, pageSize) if err != nil { - 
return 0, nil, err + return 0, nil, nil, err } displayResults := make([]*Result, len(results)) @@ -123,8 +132,8 @@ func PerformSearch(repoIDs []int64, keyword string, page, pageSize int) (int, [] startIndex, endIndex := indices(result.Content, result.StartIndex, result.EndIndex) displayResults[i], err = searchResult(result, startIndex, endIndex) if err != nil { - return 0, nil, err + return 0, nil, nil, err } } - return int(total), displayResults, nil + return int(total), displayResults, resultLanguages, nil } diff --git a/modules/indexer/code/wrapped.go b/modules/indexer/code/wrapped.go index 6a20883989..926597a382 100644 --- a/modules/indexer/code/wrapped.go +++ b/modules/indexer/code/wrapped.go @@ -71,12 +71,12 @@ func (w *wrappedIndexer) Delete(repoID int64) error { return indexer.Delete(repoID) } -func (w *wrappedIndexer) Search(repoIDs []int64, keyword string, page, pageSize int) (int64, []*SearchResult, error) { +func (w *wrappedIndexer) Search(repoIDs []int64, language, keyword string, page, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) { indexer, err := w.get() if err != nil { - return 0, nil, err + return 0, nil, nil, err } - return indexer.Search(repoIDs, keyword, page, pageSize) + return indexer.Search(repoIDs, language, keyword, page, pageSize) } diff --git a/modules/setting/indexer.go b/modules/setting/indexer.go index 859535281c..4d4df62014 100644 --- a/modules/setting/indexer.go +++ b/modules/setting/indexer.go @@ -41,6 +41,7 @@ var ( MaxIndexerFileSize int64 IncludePatterns []glob.Glob ExcludePatterns []glob.Glob + ExcludeVendored bool }{ IssueType: "bleve", IssuePath: "indexers/issues.bleve", @@ -52,6 +53,7 @@ var ( IssueQueueBatchNumber: 20, MaxIndexerFileSize: 1024 * 1024, + ExcludeVendored: true, } ) @@ -77,6 +79,7 @@ func newIndexerService() { } Indexer.IncludePatterns = IndexerGlobFromString(sec.Key("REPO_INDEXER_INCLUDE").MustString("")) Indexer.ExcludePatterns = 
IndexerGlobFromString(sec.Key("REPO_INDEXER_EXCLUDE").MustString("")) + Indexer.ExcludeVendored = sec.Key("REPO_INDEXER_EXCLUDE_VENDORED").MustBool(true) Indexer.UpdateQueueLength = sec.Key("UPDATE_BUFFER_LEN").MustInt(20) Indexer.MaxIndexerFileSize = sec.Key("MAX_FILE_SIZE").MustInt64(1024 * 1024) Indexer.StartupTimeout = sec.Key("STARTUP_TIMEOUT").MustDuration(30 * time.Second) |