123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512 |
- // Copyright 2020 The Gitea Authors. All rights reserved.
- // SPDX-License-Identifier: MIT
-
- package code
-
- import (
- "bufio"
- "context"
- "errors"
- "fmt"
- "io"
- "net"
- "strconv"
- "strings"
- "sync"
- "time"
-
- repo_model "code.gitea.io/gitea/models/repo"
- "code.gitea.io/gitea/modules/analyze"
- "code.gitea.io/gitea/modules/charset"
- "code.gitea.io/gitea/modules/git"
- "code.gitea.io/gitea/modules/graceful"
- "code.gitea.io/gitea/modules/json"
- "code.gitea.io/gitea/modules/log"
- "code.gitea.io/gitea/modules/setting"
- "code.gitea.io/gitea/modules/timeutil"
- "code.gitea.io/gitea/modules/typesniffer"
-
- "github.com/go-enry/go-enry/v2"
- "github.com/olivere/elastic/v7"
- )
-
// esRepoIndexerLatestVersion is the current version of the index layout. It is
// embedded in the name of the concrete index that the alias points at.
const esRepoIndexerLatestVersion = 1

// Multi-match query types; only these two are used at the moment.
// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/7.0/query-dsl-multi-match-query.html#multi-match-types
const (
	esMultiMatchTypeBestFields   = "best_fields"
	esMultiMatchTypePhrasePrefix = "phrase_prefix"
)
-
- var _ Indexer = &ElasticSearchIndexer{}
-
- // ElasticSearchIndexer implements Indexer interface
- type ElasticSearchIndexer struct {
- client *elastic.Client
- indexerAliasName string
- available bool
- stopTimer chan struct{}
- lock sync.RWMutex
- }
-
- // NewElasticSearchIndexer creates a new elasticsearch indexer
- func NewElasticSearchIndexer(url, indexerName string) (*ElasticSearchIndexer, bool, error) {
- opts := []elastic.ClientOptionFunc{
- elastic.SetURL(url),
- elastic.SetSniff(false),
- elastic.SetHealthcheckInterval(10 * time.Second),
- elastic.SetGzip(false),
- }
-
- logger := log.GetLogger(log.DEFAULT)
-
- opts = append(opts, elastic.SetTraceLog(&log.PrintfLogger{Logf: logger.Trace}))
- opts = append(opts, elastic.SetInfoLog(&log.PrintfLogger{Logf: logger.Info}))
- opts = append(opts, elastic.SetErrorLog(&log.PrintfLogger{Logf: logger.Error}))
-
- client, err := elastic.NewClient(opts...)
- if err != nil {
- return nil, false, err
- }
-
- indexer := &ElasticSearchIndexer{
- client: client,
- indexerAliasName: indexerName,
- available: true,
- stopTimer: make(chan struct{}),
- }
-
- ticker := time.NewTicker(10 * time.Second)
- go func() {
- for {
- select {
- case <-ticker.C:
- indexer.checkAvailability()
- case <-indexer.stopTimer:
- ticker.Stop()
- return
- }
- }
- }()
-
- exists, err := indexer.init()
- if err != nil {
- indexer.Close()
- return nil, false, err
- }
- return indexer, !exists, err
- }
-
const (
	// defaultMapping is the request body sent when the versioned index is
	// created. It declares the five document fields written by the indexer:
	// repo_id, content (with term vectors carrying positions and offsets),
	// commit_id, language and updated_at.
	defaultMapping = `{
		"mappings": {
			"properties": {
				"repo_id": {
					"type": "long",
					"index": true
				},
				"content": {
					"type": "text",
					"term_vector": "with_positions_offsets",
					"index": true
				},
				"commit_id": {
					"type": "keyword",
					"index": true
				},
				"language": {
					"type": "keyword",
					"index": true
				},
				"updated_at": {
					"type": "long",
					"index": true
				}
			}
		}
	}`
)
-
- func (b *ElasticSearchIndexer) realIndexerName() string {
- return fmt.Sprintf("%s.v%d", b.indexerAliasName, esRepoIndexerLatestVersion)
- }
-
- // Init will initialize the indexer
- func (b *ElasticSearchIndexer) init() (bool, error) {
- ctx := graceful.GetManager().HammerContext()
- exists, err := b.client.IndexExists(b.realIndexerName()).Do(ctx)
- if err != nil {
- return false, b.checkError(err)
- }
- if !exists {
- mapping := defaultMapping
-
- createIndex, err := b.client.CreateIndex(b.realIndexerName()).BodyString(mapping).Do(ctx)
- if err != nil {
- return false, b.checkError(err)
- }
- if !createIndex.Acknowledged {
- return false, fmt.Errorf("create index %s with %s failed", b.realIndexerName(), mapping)
- }
- }
-
- // check version
- r, err := b.client.Aliases().Do(ctx)
- if err != nil {
- return false, b.checkError(err)
- }
-
- realIndexerNames := r.IndicesByAlias(b.indexerAliasName)
- if len(realIndexerNames) < 1 {
- res, err := b.client.Alias().
- Add(b.realIndexerName(), b.indexerAliasName).
- Do(ctx)
- if err != nil {
- return false, b.checkError(err)
- }
- if !res.Acknowledged {
- return false, fmt.Errorf("create alias %s to index %s failed", b.indexerAliasName, b.realIndexerName())
- }
- } else if len(realIndexerNames) >= 1 && realIndexerNames[0] < b.realIndexerName() {
- log.Warn("Found older gitea indexer named %s, but we will create a new one %s and keep the old NOT DELETED. You can delete the old version after the upgrade succeed.",
- realIndexerNames[0], b.realIndexerName())
- res, err := b.client.Alias().
- Remove(realIndexerNames[0], b.indexerAliasName).
- Add(b.realIndexerName(), b.indexerAliasName).
- Do(ctx)
- if err != nil {
- return false, b.checkError(err)
- }
- if !res.Acknowledged {
- return false, fmt.Errorf("change alias %s to index %s failed", b.indexerAliasName, b.realIndexerName())
- }
- }
-
- return exists, nil
- }
-
- // Ping checks if elastic is available
- func (b *ElasticSearchIndexer) Ping() bool {
- b.lock.RLock()
- defer b.lock.RUnlock()
- return b.available
- }
-
- func (b *ElasticSearchIndexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, sha string, update fileUpdate, repo *repo_model.Repository) ([]elastic.BulkableRequest, error) {
- // Ignore vendored files in code search
- if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
- return nil, nil
- }
-
- size := update.Size
- var err error
- if !update.Sized {
- var stdout string
- stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()})
- if err != nil {
- return nil, err
- }
- if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
- return nil, fmt.Errorf("misformatted git cat-file output: %w", err)
- }
- }
-
- if size > setting.Indexer.MaxIndexerFileSize {
- return []elastic.BulkableRequest{b.addDelete(update.Filename, repo)}, nil
- }
-
- if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil {
- return nil, err
- }
-
- _, _, size, err = git.ReadBatchLine(batchReader)
- if err != nil {
- return nil, err
- }
-
- fileContents, err := io.ReadAll(io.LimitReader(batchReader, size))
- if err != nil {
- return nil, err
- } else if !typesniffer.DetectContentType(fileContents).IsText() {
- // FIXME: UTF-16 files will probably fail here
- return nil, nil
- }
-
- if _, err = batchReader.Discard(1); err != nil {
- return nil, err
- }
- id := filenameIndexerID(repo.ID, update.Filename)
-
- return []elastic.BulkableRequest{
- elastic.NewBulkIndexRequest().
- Index(b.indexerAliasName).
- Id(id).
- Doc(map[string]interface{}{
- "repo_id": repo.ID,
- "content": string(charset.ToUTF8DropErrors(fileContents)),
- "commit_id": sha,
- "language": analyze.GetCodeLanguage(update.Filename, fileContents),
- "updated_at": timeutil.TimeStampNow(),
- }),
- }, nil
- }
-
- func (b *ElasticSearchIndexer) addDelete(filename string, repo *repo_model.Repository) elastic.BulkableRequest {
- id := filenameIndexerID(repo.ID, filename)
- return elastic.NewBulkDeleteRequest().
- Index(b.indexerAliasName).
- Id(id)
- }
-
- // Index will save the index data
- func (b *ElasticSearchIndexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *repoChanges) error {
- reqs := make([]elastic.BulkableRequest, 0)
- if len(changes.Updates) > 0 {
- // Now because of some insanity with git cat-file not immediately failing if not run in a valid git directory we need to run git rev-parse first!
- if err := git.EnsureValidGitRepository(ctx, repo.RepoPath()); err != nil {
- log.Error("Unable to open git repo: %s for %-v: %v", repo.RepoPath(), repo, err)
- return err
- }
-
- batchWriter, batchReader, cancel := git.CatFileBatch(ctx, repo.RepoPath())
- defer cancel()
-
- for _, update := range changes.Updates {
- updateReqs, err := b.addUpdate(ctx, batchWriter, batchReader, sha, update, repo)
- if err != nil {
- return err
- }
- if len(updateReqs) > 0 {
- reqs = append(reqs, updateReqs...)
- }
- }
- cancel()
- }
-
- for _, filename := range changes.RemovedFilenames {
- reqs = append(reqs, b.addDelete(filename, repo))
- }
-
- if len(reqs) > 0 {
- _, err := b.client.Bulk().
- Index(b.indexerAliasName).
- Add(reqs...).
- Do(ctx)
- return b.checkError(err)
- }
- return nil
- }
-
- // Delete deletes indexes by ids
- func (b *ElasticSearchIndexer) Delete(repoID int64) error {
- _, err := b.client.DeleteByQuery(b.indexerAliasName).
- Query(elastic.NewTermsQuery("repo_id", repoID)).
- Do(graceful.GetManager().HammerContext())
- return b.checkError(err)
- }
-
// indexPos locates the first occurrence of start in content and the first
// occurrence of end after it. It returns the byte offset where start begins
// and the byte offset just past end. When either marker cannot be found it
// returns -1, -1.
func indexPos(content, start, end string) (int, int) {
	open := strings.Index(content, start)
	if open >= 0 {
		afterOpen := open + len(start)
		// Only look for the end marker behind the start marker.
		if rel := strings.Index(content[afterOpen:], end); rel >= 0 {
			return open, afterOpen + rel + len(end)
		}
	}
	return -1, -1
}
-
- func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) {
- hits := make([]*SearchResult, 0, pageSize)
- for _, hit := range searchResult.Hits.Hits {
- // FIXME: There is no way to get the position the keyword on the content currently on the same request.
- // So we get it from content, this may made the query slower. See
- // https://discuss.elastic.co/t/fetching-position-of-keyword-in-matched-document/94291
- var startIndex, endIndex int
- c, ok := hit.Highlight["content"]
- if ok && len(c) > 0 {
- // FIXME: Since the highlighting content will include <em> and </em> for the keywords,
- // now we should find the positions. But how to avoid html content which contains the
- // <em> and </em> tags? If elastic search has handled that?
- startIndex, endIndex = indexPos(c[0], "<em>", "</em>")
- if startIndex == -1 {
- panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0]))
- }
- } else {
- panic(fmt.Sprintf("2===%#v", hit.Highlight))
- }
-
- repoID, fileName := parseIndexerID(hit.Id)
- res := make(map[string]interface{})
- if err := json.Unmarshal(hit.Source, &res); err != nil {
- return 0, nil, nil, err
- }
-
- language := res["language"].(string)
-
- hits = append(hits, &SearchResult{
- RepoID: repoID,
- Filename: fileName,
- CommitID: res["commit_id"].(string),
- Content: res["content"].(string),
- UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)),
- Language: language,
- StartIndex: startIndex,
- EndIndex: endIndex - 9, // remove the length <em></em> since we give Content the original data
- Color: enry.GetColor(language),
- })
- }
-
- return searchResult.TotalHits(), hits, extractAggs(searchResult), nil
- }
-
- func extractAggs(searchResult *elastic.SearchResult) []*SearchResultLanguages {
- var searchResultLanguages []*SearchResultLanguages
- agg, found := searchResult.Aggregations.Terms("language")
- if found {
- searchResultLanguages = make([]*SearchResultLanguages, 0, 10)
-
- for _, bucket := range agg.Buckets {
- searchResultLanguages = append(searchResultLanguages, &SearchResultLanguages{
- Language: bucket.Key.(string),
- Color: enry.GetColor(bucket.Key.(string)),
- Count: int(bucket.DocCount),
- })
- }
- }
- return searchResultLanguages
- }
-
- // Search searches for codes and language stats by given conditions.
- func (b *ElasticSearchIndexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isMatch bool) (int64, []*SearchResult, []*SearchResultLanguages, error) {
- searchType := esMultiMatchTypeBestFields
- if isMatch {
- searchType = esMultiMatchTypePhrasePrefix
- }
-
- kwQuery := elastic.NewMultiMatchQuery(keyword, "content").Type(searchType)
- query := elastic.NewBoolQuery()
- query = query.Must(kwQuery)
- if len(repoIDs) > 0 {
- repoStrs := make([]interface{}, 0, len(repoIDs))
- for _, repoID := range repoIDs {
- repoStrs = append(repoStrs, repoID)
- }
- repoQuery := elastic.NewTermsQuery("repo_id", repoStrs...)
- query = query.Must(repoQuery)
- }
-
- var (
- start int
- kw = "<em>" + keyword + "</em>"
- aggregation = elastic.NewTermsAggregation().Field("language").Size(10).OrderByCountDesc()
- )
-
- if page > 0 {
- start = (page - 1) * pageSize
- }
-
- if len(language) == 0 {
- searchResult, err := b.client.Search().
- Index(b.indexerAliasName).
- Aggregation("language", aggregation).
- Query(query).
- Highlight(
- elastic.NewHighlight().
- Field("content").
- NumOfFragments(0). // return all highting content on fragments
- HighlighterType("fvh"),
- ).
- Sort("repo_id", true).
- From(start).Size(pageSize).
- Do(ctx)
- if err != nil {
- return 0, nil, nil, b.checkError(err)
- }
-
- return convertResult(searchResult, kw, pageSize)
- }
-
- langQuery := elastic.NewMatchQuery("language", language)
- countResult, err := b.client.Search().
- Index(b.indexerAliasName).
- Aggregation("language", aggregation).
- Query(query).
- Size(0). // We only needs stats information
- Do(ctx)
- if err != nil {
- return 0, nil, nil, b.checkError(err)
- }
-
- query = query.Must(langQuery)
- searchResult, err := b.client.Search().
- Index(b.indexerAliasName).
- Query(query).
- Highlight(
- elastic.NewHighlight().
- Field("content").
- NumOfFragments(0). // return all highting content on fragments
- HighlighterType("fvh"),
- ).
- Sort("repo_id", true).
- From(start).Size(pageSize).
- Do(ctx)
- if err != nil {
- return 0, nil, nil, b.checkError(err)
- }
-
- total, hits, _, err := convertResult(searchResult, kw, pageSize)
-
- return total, hits, extractAggs(countResult), err
- }
-
- // Close implements indexer
- func (b *ElasticSearchIndexer) Close() {
- select {
- case <-b.stopTimer:
- default:
- close(b.stopTimer)
- }
- }
-
- func (b *ElasticSearchIndexer) checkError(err error) error {
- var opErr *net.OpError
- if !(elastic.IsConnErr(err) || (errors.As(err, &opErr) && (opErr.Op == "dial" || opErr.Op == "read"))) {
- return err
- }
-
- b.setAvailability(false)
-
- return err
- }
-
- func (b *ElasticSearchIndexer) checkAvailability() {
- if b.Ping() {
- return
- }
-
- // Request cluster state to check if elastic is available again
- _, err := b.client.ClusterState().Do(graceful.GetManager().ShutdownContext())
- if err != nil {
- b.setAvailability(false)
- return
- }
-
- b.setAvailability(true)
- }
-
- func (b *ElasticSearchIndexer) setAvailability(available bool) {
- b.lock.Lock()
- defer b.lock.Unlock()
-
- if b.available == available {
- return
- }
-
- b.available = available
- }
|