summaryrefslogtreecommitdiffstats
path: root/modules/indexer
diff options
context:
space:
mode:
authorLauris BH <lauris@nix.lv>2020-02-20 21:53:55 +0200
committerGitHub <noreply@github.com>2020-02-20 16:53:55 -0300
commit3c45cf8494fcd29e1a99b0ee6f253808eb607053 (patch)
treecd0e6347bcd2bfc42c18408169a8757ca1dda920 /modules/indexer
parentefbd7ca39bde69ff84d4b309bee99edfe2977521 (diff)
downloadgitea-3c45cf8494fcd29e1a99b0ee6f253808eb607053.tar.gz
gitea-3c45cf8494fcd29e1a99b0ee6f253808eb607053.zip
Add detected file language to code search (#10256)
Move langauge detection to separate module to be more reusable Add option to disable vendored file exclusion from file search Allways show all language stats for search
Diffstat (limited to 'modules/indexer')
-rw-r--r--modules/indexer/code/bleve.go114
-rw-r--r--modules/indexer/code/bleve_test.go9
-rw-r--r--modules/indexer/code/indexer.go24
-rw-r--r--modules/indexer/code/search.go21
-rw-r--r--modules/indexer/code/wrapped.go6
5 files changed, 140 insertions, 34 deletions
diff --git a/modules/indexer/code/bleve.go b/modules/indexer/code/bleve.go
index 6052304f83..39171d17a6 100644
--- a/modules/indexer/code/bleve.go
+++ b/modules/indexer/code/bleve.go
@@ -9,16 +9,20 @@ import (
"os"
"strconv"
"strings"
+ "time"
"code.gitea.io/gitea/models"
+ "code.gitea.io/gitea/modules/analyze"
"code.gitea.io/gitea/modules/base"
"code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
+ "code.gitea.io/gitea/modules/timeutil"
"github.com/blevesearch/bleve"
- "github.com/blevesearch/bleve/analysis/analyzer/custom"
+ analyzer_custom "github.com/blevesearch/bleve/analysis/analyzer/custom"
+ analyzer_keyword "github.com/blevesearch/bleve/analysis/analyzer/keyword"
"github.com/blevesearch/bleve/analysis/token/lowercase"
"github.com/blevesearch/bleve/analysis/token/unicodenorm"
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
@@ -26,6 +30,7 @@ import (
"github.com/blevesearch/bleve/mapping"
"github.com/blevesearch/bleve/search/query"
"github.com/ethantkoenig/rupture"
+ "github.com/src-d/enry/v2"
)
const unicodeNormalizeName = "unicodeNormalize"
@@ -86,8 +91,11 @@ func openIndexer(path string, latestVersion int) (bleve.Index, error) {
// RepoIndexerData data stored in the repo indexer
type RepoIndexerData struct {
- RepoID int64
- Content string
+ RepoID int64
+ CommitID string
+ Content string
+ Language string
+ UpdatedAt time.Time
}
// Type returns the document type, for bleve's mapping.Classifier interface.
@@ -95,7 +103,11 @@ func (d *RepoIndexerData) Type() string {
return repoIndexerDocType
}
-func addUpdate(update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
+func addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
+ // Ignore vendored files in code search
+ if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
+ return nil
+ }
stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
RunInDir(repo.RepoPath())
if err != nil {
@@ -118,8 +130,11 @@ func addUpdate(update fileUpdate, repo *models.Repository, batch rupture.Flushin
id := filenameIndexerID(repo.ID, update.Filename)
return batch.Index(id, &RepoIndexerData{
- RepoID: repo.ID,
- Content: string(charset.ToUTF8DropErrors(fileContents)),
+ RepoID: repo.ID,
+ CommitID: commitSha,
+ Content: string(charset.ToUTF8DropErrors(fileContents)),
+ Language: analyze.GetCodeLanguage(update.Filename, fileContents),
+ UpdatedAt: time.Now().UTC(),
})
}
@@ -131,7 +146,7 @@ func addDelete(filename string, repo *models.Repository, batch rupture.FlushingB
const (
repoIndexerAnalyzer = "repoIndexerAnalyzer"
repoIndexerDocType = "repoIndexerDocType"
- repoIndexerLatestVersion = 4
+ repoIndexerLatestVersion = 5
)
// createRepoIndexer create a repo indexer if one does not already exist
@@ -145,11 +160,21 @@ func createRepoIndexer(path string, latestVersion int) (bleve.Index, error) {
textFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("Content", textFieldMapping)
+ termFieldMapping := bleve.NewTextFieldMapping()
+ termFieldMapping.IncludeInAll = false
+ termFieldMapping.Analyzer = analyzer_keyword.Name
+ docMapping.AddFieldMappingsAt("Language", termFieldMapping)
+ docMapping.AddFieldMappingsAt("CommitID", termFieldMapping)
+
+ timeFieldMapping := bleve.NewDateTimeFieldMapping()
+ timeFieldMapping.IncludeInAll = false
+ docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
+
mapping := bleve.NewIndexMapping()
if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
return nil, err
} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]interface{}{
- "type": custom.Name,
+ "type": analyzer_custom.Name,
"char_filters": []string{},
"tokenizer": unicode.Name,
"token_filters": []string{unicodeNormalizeName, lowercase.Name},
@@ -255,7 +280,7 @@ func (b *BleveIndexer) Index(repoID int64) error {
batch := rupture.NewFlushingBatch(b.indexer, maxBatchSize)
for _, update := range changes.Updates {
- if err := addUpdate(update, repo, batch); err != nil {
+ if err := addUpdate(sha, update, repo, batch); err != nil {
return err
}
}
@@ -289,7 +314,7 @@ func (b *BleveIndexer) Delete(repoID int64) error {
// Search searches for files in the specified repo.
// Returns the matching file-paths
-func (b *BleveIndexer) Search(repoIDs []int64, keyword string, page, pageSize int) (int64, []*SearchResult, error) {
+func (b *BleveIndexer) Search(repoIDs []int64, language, keyword string, page, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) {
phraseQuery := bleve.NewMatchPhraseQuery(keyword)
phraseQuery.FieldVal = "Content"
phraseQuery.Analyzer = repoIndexerAnalyzer
@@ -309,16 +334,35 @@ func (b *BleveIndexer) Search(repoIDs []int64, keyword string, page, pageSize in
indexerQuery = phraseQuery
}
+ // Save for reuse without language filter
+ facetQuery := indexerQuery
+ if len(language) > 0 {
+ languageQuery := bleve.NewMatchQuery(language)
+ languageQuery.FieldVal = "Language"
+ languageQuery.Analyzer = analyzer_keyword.Name
+
+ indexerQuery = bleve.NewConjunctionQuery(
+ indexerQuery,
+ languageQuery,
+ )
+ }
+
from := (page - 1) * pageSize
searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
- searchRequest.Fields = []string{"Content", "RepoID"}
+ searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
searchRequest.IncludeLocations = true
+ if len(language) == 0 {
+ searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
+ }
+
result, err := b.indexer.Search(searchRequest)
if err != nil {
- return 0, nil, err
+ return 0, nil, nil, err
}
+ total := int64(result.Total)
+
searchResults := make([]*SearchResult, len(result.Hits))
for i, hit := range result.Hits {
var startIndex, endIndex int = -1, -1
@@ -333,13 +377,47 @@ func (b *BleveIndexer) Search(repoIDs []int64, keyword string, page, pageSize in
endIndex = locationEnd
}
}
+ language := hit.Fields["Language"].(string)
+ var updatedUnix timeutil.TimeStamp
+ if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
+ updatedUnix = timeutil.TimeStamp(t.Unix())
+ }
searchResults[i] = &SearchResult{
- RepoID: int64(hit.Fields["RepoID"].(float64)),
- StartIndex: startIndex,
- EndIndex: endIndex,
- Filename: filenameOfIndexerID(hit.ID),
- Content: hit.Fields["Content"].(string),
+ RepoID: int64(hit.Fields["RepoID"].(float64)),
+ StartIndex: startIndex,
+ EndIndex: endIndex,
+ Filename: filenameOfIndexerID(hit.ID),
+ Content: hit.Fields["Content"].(string),
+ CommitID: hit.Fields["CommitID"].(string),
+ UpdatedUnix: updatedUnix,
+ Language: language,
+ Color: enry.GetColor(language),
+ }
+ }
+
+ searchResultLanguages := make([]*SearchResultLanguages, 0, 10)
+ if len(language) > 0 {
+ // Use separate query to go get all language counts
+ facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false)
+ facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
+ facetRequest.IncludeLocations = true
+ facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
+
+ if result, err = b.indexer.Search(facetRequest); err != nil {
+ return 0, nil, nil, err
+ }
+
+ }
+ languagesFacet := result.Facets["languages"]
+ for _, term := range languagesFacet.Terms {
+ if len(term.Term) == 0 {
+ continue
}
+ searchResultLanguages = append(searchResultLanguages, &SearchResultLanguages{
+ Language: term.Term,
+ Color: enry.GetColor(term.Term),
+ Count: term.Count,
+ })
}
- return int64(result.Total), searchResults, nil
+ return total, searchResults, searchResultLanguages, nil
}
diff --git a/modules/indexer/code/bleve_test.go b/modules/indexer/code/bleve_test.go
index 695dceb259..89cfceea2d 100644
--- a/modules/indexer/code/bleve_test.go
+++ b/modules/indexer/code/bleve_test.go
@@ -49,27 +49,34 @@ func TestIndexAndSearch(t *testing.T) {
keywords = []struct {
Keyword string
IDs []int64
+ Langs int
}{
{
Keyword: "Description",
IDs: []int64{1},
+ Langs: 1,
},
{
Keyword: "repo1",
IDs: []int64{1},
+ Langs: 1,
},
{
Keyword: "non-exist",
IDs: []int64{},
+ Langs: 0,
},
}
)
for _, kw := range keywords {
- total, res, err := idx.Search(nil, kw.Keyword, 1, 10)
+ total, res, langs, err := idx.Search(nil, "", kw.Keyword, 1, 10)
assert.NoError(t, err)
assert.EqualValues(t, len(kw.IDs), total)
+ assert.NotNil(t, langs)
+ assert.Len(t, langs, kw.Langs)
+
var ids = make([]int64, 0, len(res))
for _, hit := range res {
ids = append(ids, hit.RepoID)
diff --git a/modules/indexer/code/indexer.go b/modules/indexer/code/indexer.go
index 3f9461cd0e..6cbda1491b 100644
--- a/modules/indexer/code/indexer.go
+++ b/modules/indexer/code/indexer.go
@@ -12,22 +12,34 @@ import (
"code.gitea.io/gitea/modules/graceful"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
+ "code.gitea.io/gitea/modules/timeutil"
)
// SearchResult result of performing a search in a repo
type SearchResult struct {
- RepoID int64
- StartIndex int
- EndIndex int
- Filename string
- Content string
+ RepoID int64
+ StartIndex int
+ EndIndex int
+ Filename string
+ Content string
+ CommitID string
+ UpdatedUnix timeutil.TimeStamp
+ Language string
+ Color string
+}
+
+// SearchResultLanguages result of top languages count in search results
+type SearchResultLanguages struct {
+ Language string
+ Color string
+ Count int
}
// Indexer defines an interface to indexer issues contents
type Indexer interface {
Index(repoID int64) error
Delete(repoID int64) error
- Search(repoIDs []int64, keyword string, page, pageSize int) (int64, []*SearchResult, error)
+ Search(repoIDs []int64, language, keyword string, page, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error)
Close()
}
diff --git a/modules/indexer/code/search.go b/modules/indexer/code/search.go
index 18f193a532..ca57b3ff88 100644
--- a/modules/indexer/code/search.go
+++ b/modules/indexer/code/search.go
@@ -11,6 +11,7 @@ import (
"strings"
"code.gitea.io/gitea/modules/highlight"
+ "code.gitea.io/gitea/modules/timeutil"
"code.gitea.io/gitea/modules/util"
)
@@ -18,6 +19,10 @@ import (
type Result struct {
RepoID int64
Filename string
+ CommitID string
+ UpdatedUnix timeutil.TimeStamp
+ Language string
+ Color string
HighlightClass string
LineNumbers []int
FormattedLines gotemplate.HTML
@@ -100,6 +105,10 @@ func searchResult(result *SearchResult, startIndex, endIndex int) (*Result, erro
return &Result{
RepoID: result.RepoID,
Filename: result.Filename,
+ CommitID: result.CommitID,
+ UpdatedUnix: result.UpdatedUnix,
+ Language: result.Language,
+ Color: result.Color,
HighlightClass: highlight.FileNameToHighlightClass(result.Filename),
LineNumbers: lineNumbers,
FormattedLines: gotemplate.HTML(formattedLinesBuffer.String()),
@@ -107,14 +116,14 @@ func searchResult(result *SearchResult, startIndex, endIndex int) (*Result, erro
}
// PerformSearch perform a search on a repository
-func PerformSearch(repoIDs []int64, keyword string, page, pageSize int) (int, []*Result, error) {
+func PerformSearch(repoIDs []int64, language, keyword string, page, pageSize int) (int, []*Result, []*SearchResultLanguages, error) {
if len(keyword) == 0 {
- return 0, nil, nil
+ return 0, nil, nil, nil
}
- total, results, err := indexer.Search(repoIDs, keyword, page, pageSize)
+ total, results, resultLanguages, err := indexer.Search(repoIDs, language, keyword, page, pageSize)
if err != nil {
- return 0, nil, err
+ return 0, nil, nil, err
}
displayResults := make([]*Result, len(results))
@@ -123,8 +132,8 @@ func PerformSearch(repoIDs []int64, keyword string, page, pageSize int) (int, []
startIndex, endIndex := indices(result.Content, result.StartIndex, result.EndIndex)
displayResults[i], err = searchResult(result, startIndex, endIndex)
if err != nil {
- return 0, nil, err
+ return 0, nil, nil, err
}
}
- return int(total), displayResults, nil
+ return int(total), displayResults, resultLanguages, nil
}
diff --git a/modules/indexer/code/wrapped.go b/modules/indexer/code/wrapped.go
index 6a20883989..926597a382 100644
--- a/modules/indexer/code/wrapped.go
+++ b/modules/indexer/code/wrapped.go
@@ -71,12 +71,12 @@ func (w *wrappedIndexer) Delete(repoID int64) error {
return indexer.Delete(repoID)
}
-func (w *wrappedIndexer) Search(repoIDs []int64, keyword string, page, pageSize int) (int64, []*SearchResult, error) {
+func (w *wrappedIndexer) Search(repoIDs []int64, language, keyword string, page, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) {
indexer, err := w.get()
if err != nil {
- return 0, nil, err
+ return 0, nil, nil, err
}
- return indexer.Search(repoIDs, keyword, page, pageSize)
+ return indexer.Search(repoIDs, language, keyword, page, pageSize)
}