author    Lauris BH <lauris@nix.lv>       2020-02-20 21:53:55 +0200
committer GitHub <noreply@github.com>     2020-02-20 16:53:55 -0300
commit    3c45cf8494fcd29e1a99b0ee6f253808eb607053 (patch)
tree      cd0e6347bcd2bfc42c18408169a8757ca1dda920 /modules/indexer/code/bleve.go
parent    efbd7ca39bde69ff84d4b309bee99edfe2977521 (diff)
Add detected file language to code search (#10256)
Move language detection to a separate module to be more reusable.
Add an option to disable vendored file exclusion from file search.
Always show all language stats for search.
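The commit pulls in the enry library: enry.IsVendor drives the new vendored-file exclusion, and the new analyze.GetCodeLanguage helper presumably wraps enry's language detection. As a minimal, self-contained sketch of how those calls behave (the file name and contents below are invented for illustration):

package main

import (
	"fmt"

	"github.com/src-d/enry/v2"
)

func main() {
	// Hypothetical inputs, for illustration only.
	filename := "vendor/github.com/example/lib/util.go"
	content := []byte("package lib\n\nfunc Add(a, b int) int { return a + b }\n")

	// enry.IsVendor reports whether a path looks vendored; with the new
	// ExcludeVendored setting enabled the indexer skips such files.
	if enry.IsVendor(filename) {
		fmt.Println("vendored path: the indexer would skip this file")
	}

	// enry.GetLanguage detects a language from the file name and contents,
	// and enry.GetColor maps that language to its display color.
	lang := enry.GetLanguage(filename, content)
	fmt.Printf("language=%q color=%q\n", lang, enry.GetColor(lang))
}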
Diffstat (limited to 'modules/indexer/code/bleve.go')
-rw-r--r--  modules/indexer/code/bleve.go  114
1 file changed, 96 insertions(+), 18 deletions(-)
diff --git a/modules/indexer/code/bleve.go b/modules/indexer/code/bleve.go
index 6052304f83..39171d17a6 100644
--- a/modules/indexer/code/bleve.go
+++ b/modules/indexer/code/bleve.go
@@ -9,16 +9,20 @@ import (
"os"
"strconv"
"strings"
+ "time"
"code.gitea.io/gitea/models"
+ "code.gitea.io/gitea/modules/analyze"
"code.gitea.io/gitea/modules/base"
"code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
+ "code.gitea.io/gitea/modules/timeutil"
"github.com/blevesearch/bleve"
- "github.com/blevesearch/bleve/analysis/analyzer/custom"
+ analyzer_custom "github.com/blevesearch/bleve/analysis/analyzer/custom"
+ analyzer_keyword "github.com/blevesearch/bleve/analysis/analyzer/keyword"
"github.com/blevesearch/bleve/analysis/token/lowercase"
"github.com/blevesearch/bleve/analysis/token/unicodenorm"
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
@@ -26,6 +30,7 @@ import (
"github.com/blevesearch/bleve/mapping"
"github.com/blevesearch/bleve/search/query"
"github.com/ethantkoenig/rupture"
+ "github.com/src-d/enry/v2"
)
const unicodeNormalizeName = "unicodeNormalize"
@@ -86,8 +91,11 @@ func openIndexer(path string, latestVersion int) (bleve.Index, error) {
// RepoIndexerData data stored in the repo indexer
type RepoIndexerData struct {
- RepoID int64
- Content string
+ RepoID int64
+ CommitID string
+ Content string
+ Language string
+ UpdatedAt time.Time
}
// Type returns the document type, for bleve's mapping.Classifier interface.
@@ -95,7 +103,11 @@ func (d *RepoIndexerData) Type() string {
return repoIndexerDocType
}
-func addUpdate(update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
+func addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
+ // Ignore vendored files in code search
+ if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
+ return nil
+ }
stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
RunInDir(repo.RepoPath())
if err != nil {
@@ -118,8 +130,11 @@ func addUpdate(update fileUpdate, repo *models.Repository, batch rupture.Flushin
id := filenameIndexerID(repo.ID, update.Filename)
return batch.Index(id, &RepoIndexerData{
- RepoID: repo.ID,
- Content: string(charset.ToUTF8DropErrors(fileContents)),
+ RepoID: repo.ID,
+ CommitID: commitSha,
+ Content: string(charset.ToUTF8DropErrors(fileContents)),
+ Language: analyze.GetCodeLanguage(update.Filename, fileContents),
+ UpdatedAt: time.Now().UTC(),
})
}
@@ -131,7 +146,7 @@ func addDelete(filename string, repo *models.Repository, batch rupture.FlushingB
const (
repoIndexerAnalyzer = "repoIndexerAnalyzer"
repoIndexerDocType = "repoIndexerDocType"
- repoIndexerLatestVersion = 4
+ repoIndexerLatestVersion = 5
)
// createRepoIndexer create a repo indexer if one does not already exist
@@ -145,11 +160,21 @@ func createRepoIndexer(path string, latestVersion int) (bleve.Index, error) {
textFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("Content", textFieldMapping)
+ termFieldMapping := bleve.NewTextFieldMapping()
+ termFieldMapping.IncludeInAll = false
+ termFieldMapping.Analyzer = analyzer_keyword.Name
+ docMapping.AddFieldMappingsAt("Language", termFieldMapping)
+ docMapping.AddFieldMappingsAt("CommitID", termFieldMapping)
+
+ timeFieldMapping := bleve.NewDateTimeFieldMapping()
+ timeFieldMapping.IncludeInAll = false
+ docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
+
mapping := bleve.NewIndexMapping()
if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
return nil, err
} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]interface{}{
- "type": custom.Name,
+ "type": analyzer_custom.Name,
"char_filters": []string{},
"tokenizer": unicode.Name,
"token_filters": []string{unicodeNormalizeName, lowercase.Name},
@@ -255,7 +280,7 @@ func (b *BleveIndexer) Index(repoID int64) error {
batch := rupture.NewFlushingBatch(b.indexer, maxBatchSize)
for _, update := range changes.Updates {
- if err := addUpdate(update, repo, batch); err != nil {
+ if err := addUpdate(sha, update, repo, batch); err != nil {
return err
}
}
@@ -289,7 +314,7 @@ func (b *BleveIndexer) Delete(repoID int64) error {
// Search searches for files in the specified repo.
// Returns the matching file-paths
-func (b *BleveIndexer) Search(repoIDs []int64, keyword string, page, pageSize int) (int64, []*SearchResult, error) {
+func (b *BleveIndexer) Search(repoIDs []int64, language, keyword string, page, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) {
phraseQuery := bleve.NewMatchPhraseQuery(keyword)
phraseQuery.FieldVal = "Content"
phraseQuery.Analyzer = repoIndexerAnalyzer
@@ -309,16 +334,35 @@ func (b *BleveIndexer) Search(repoIDs []int64, keyword string, page, pageSize in
indexerQuery = phraseQuery
}
+ // Save for reuse without language filter
+ facetQuery := indexerQuery
+ if len(language) > 0 {
+ languageQuery := bleve.NewMatchQuery(language)
+ languageQuery.FieldVal = "Language"
+ languageQuery.Analyzer = analyzer_keyword.Name
+
+ indexerQuery = bleve.NewConjunctionQuery(
+ indexerQuery,
+ languageQuery,
+ )
+ }
+
from := (page - 1) * pageSize
searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
- searchRequest.Fields = []string{"Content", "RepoID"}
+ searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
searchRequest.IncludeLocations = true
+ if len(language) == 0 {
+ searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
+ }
+
result, err := b.indexer.Search(searchRequest)
if err != nil {
- return 0, nil, err
+ return 0, nil, nil, err
}
+ total := int64(result.Total)
+
searchResults := make([]*SearchResult, len(result.Hits))
for i, hit := range result.Hits {
var startIndex, endIndex int = -1, -1
@@ -333,13 +377,47 @@ func (b *BleveIndexer) Search(repoIDs []int64, keyword string, page, pageSize in
endIndex = locationEnd
}
}
+ language := hit.Fields["Language"].(string)
+ var updatedUnix timeutil.TimeStamp
+ if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
+ updatedUnix = timeutil.TimeStamp(t.Unix())
+ }
searchResults[i] = &SearchResult{
- RepoID: int64(hit.Fields["RepoID"].(float64)),
- StartIndex: startIndex,
- EndIndex: endIndex,
- Filename: filenameOfIndexerID(hit.ID),
- Content: hit.Fields["Content"].(string),
+ RepoID: int64(hit.Fields["RepoID"].(float64)),
+ StartIndex: startIndex,
+ EndIndex: endIndex,
+ Filename: filenameOfIndexerID(hit.ID),
+ Content: hit.Fields["Content"].(string),
+ CommitID: hit.Fields["CommitID"].(string),
+ UpdatedUnix: updatedUnix,
+ Language: language,
+ Color: enry.GetColor(language),
+ }
+ }
+
+ searchResultLanguages := make([]*SearchResultLanguages, 0, 10)
+ if len(language) > 0 {
+ // Use a separate query to get all language counts
+ facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false)
+ facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
+ facetRequest.IncludeLocations = true
+ facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
+
+ if result, err = b.indexer.Search(facetRequest); err != nil {
+ return 0, nil, nil, err
+ }
+
+ }
+ languagesFacet := result.Facets["languages"]
+ for _, term := range languagesFacet.Terms {
+ if len(term.Term) == 0 {
+ continue
}
+ searchResultLanguages = append(searchResultLanguages, &SearchResultLanguages{
+ Language: term.Term,
+ Color: enry.GetColor(term.Term),
+ Count: term.Count,
+ })
}
- return int64(result.Total), searchResults, nil
+ return total, searchResults, searchResultLanguages, nil
}
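To show how the widened Search signature is meant to be consumed, here is a hypothetical caller; the printSearch helper, the import alias, and the paging values are invented for illustration, since the real call sites live outside this file:

package main

import (
	"log"

	code_indexer "code.gitea.io/gitea/modules/indexer/code"
)

// printSearch shows how the new return values fit together: the total hit
// count, per-file matches that now carry CommitID, Language and Color, and
// the per-language facet counts.
func printSearch(b *code_indexer.BleveIndexer, repoIDs []int64, language, keyword string) error {
	// Hypothetical paging: first page, 50 results.
	total, results, resultLanguages, err := b.Search(repoIDs, language, keyword, 1, 50)
	if err != nil {
		return err
	}
	log.Printf("%d total matches", total)
	for _, hit := range results {
		log.Printf("%s [%s] at commit %s, match %d-%d",
			hit.Filename, hit.Language, hit.CommitID, hit.StartIndex, hit.EndIndex)
	}
	for _, l := range resultLanguages {
		log.Printf("language %s (color %s): %d files", l.Language, l.Color, l.Count)
	}
	return nil
}

func main() {
	// Intentionally empty; printSearch above is only a sketch.
}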