diff options
author | Lunny Xiao <xiaolunwen@gmail.com> | 2020-08-31 00:08:01 +0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-08-30 19:08:01 +0300 |
commit | 9bc69ff26eeebaf3b622d62d18c757ff1f401dda (patch) | |
tree | 69ff71d9d460e83a6fff54b172b604732ab5d065 /modules/indexer/code/bleve.go | |
parent | d257485bc0026c9717fe7bf4c9953ad1b7a1a9ae (diff) | |
download | gitea-9bc69ff26eeebaf3b622d62d18c757ff1f401dda.tar.gz gitea-9bc69ff26eeebaf3b622d62d18c757ff1f401dda.zip |
Support elastic search for code search (#10273)
* Support elastic search for code search
* Finished elastic search implementation and add some tests
* Enable test on drone and added docs
* Add new fields to elastic search
* Fix bug
* remove unused changes
* Use indexer alias to keep the gitea indexer version
* Improve codes
* Some code improvements
* The real indexer name changed to xxx.v1
Co-authored-by: zeripath <art27@cantab.net>
Diffstat (limited to 'modules/indexer/code/bleve.go')
-rw-r--r-- | modules/indexer/code/bleve.go | 132 |
1 files changed, 51 insertions, 81 deletions
diff --git a/modules/indexer/code/bleve.go b/modules/indexer/code/bleve.go index 6502259ba4..81373bf3da 100644 --- a/modules/indexer/code/bleve.go +++ b/modules/indexer/code/bleve.go @@ -58,10 +58,10 @@ func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error { }) } -// openIndexer open the index at the specified path, checking for metadata +// openBleveIndexer open the index at the specified path, checking for metadata // updates and bleve version updates. If index needs to be created (or // re-created), returns (nil, nil) -func openIndexer(path string, latestVersion int) (bleve.Index, error) { +func openBleveIndexer(path string, latestVersion int) (bleve.Index, error) { _, err := os.Stat(path) if err != nil && os.IsNotExist(err) { return nil, nil @@ -104,54 +104,14 @@ func (d *RepoIndexerData) Type() string { return repoIndexerDocType } -func addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error { - // Ignore vendored files in code search - if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) { - return nil - } - stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha). - RunInDir(repo.RepoPath()) - if err != nil { - return err - } - if size, err := strconv.Atoi(strings.TrimSpace(stdout)); err != nil { - return fmt.Errorf("Misformatted git cat-file output: %v", err) - } else if int64(size) > setting.Indexer.MaxIndexerFileSize { - return addDelete(update.Filename, repo, batch) - } - - fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha). - RunInDirBytes(repo.RepoPath()) - if err != nil { - return err - } else if !base.IsTextFile(fileContents) { - // FIXME: UTF-16 files will probably fail here - return nil - } - - id := filenameIndexerID(repo.ID, update.Filename) - return batch.Index(id, &RepoIndexerData{ - RepoID: repo.ID, - CommitID: commitSha, - Content: string(charset.ToUTF8DropErrors(fileContents)), - Language: analyze.GetCodeLanguage(update.Filename, fileContents), - UpdatedAt: time.Now().UTC(), - }) -} - -func addDelete(filename string, repo *models.Repository, batch rupture.FlushingBatch) error { - id := filenameIndexerID(repo.ID, filename) - return batch.Delete(id) -} - const ( repoIndexerAnalyzer = "repoIndexerAnalyzer" repoIndexerDocType = "repoIndexerDocType" repoIndexerLatestVersion = 5 ) -// createRepoIndexer create a repo indexer if one does not already exist -func createRepoIndexer(path string, latestVersion int) (bleve.Index, error) { +// createBleveIndexer create a bleve repo indexer if one does not already exist +func createBleveIndexer(path string, latestVersion int) (bleve.Index, error) { docMapping := bleve.NewDocumentMapping() numericFieldMapping := bleve.NewNumericFieldMapping() numericFieldMapping.IncludeInAll = false @@ -199,18 +159,6 @@ func createRepoIndexer(path string, latestVersion int) (bleve.Index, error) { return indexer, nil } -func filenameIndexerID(repoID int64, filename string) string { - return indexerID(repoID) + "_" + filename -} - -func filenameOfIndexerID(indexerID string) string { - index := strings.IndexByte(indexerID, '_') - if index == -1 { - log.Error("Unexpected ID in repo indexer: %s", indexerID) - } - return indexerID[index+1:] -} - var ( _ Indexer = &BleveIndexer{} ) @@ -230,10 +178,51 @@ func NewBleveIndexer(indexDir string) (*BleveIndexer, bool, error) { return indexer, created, err } +func (b *BleveIndexer) addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error { + // Ignore vendored files in code search + if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) { + return nil + } + + stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha). + RunInDir(repo.RepoPath()) + if err != nil { + return err + } + if size, err := strconv.Atoi(strings.TrimSpace(stdout)); err != nil { + return fmt.Errorf("Misformatted git cat-file output: %v", err) + } else if int64(size) > setting.Indexer.MaxIndexerFileSize { + return b.addDelete(update.Filename, repo, batch) + } + + fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha). + RunInDirBytes(repo.RepoPath()) + if err != nil { + return err + } else if !base.IsTextFile(fileContents) { + // FIXME: UTF-16 files will probably fail here + return nil + } + + id := filenameIndexerID(repo.ID, update.Filename) + return batch.Index(id, &RepoIndexerData{ + RepoID: repo.ID, + CommitID: commitSha, + Content: string(charset.ToUTF8DropErrors(fileContents)), + Language: analyze.GetCodeLanguage(update.Filename, fileContents), + UpdatedAt: time.Now().UTC(), + }) +} + +func (b *BleveIndexer) addDelete(filename string, repo *models.Repository, batch rupture.FlushingBatch) error { + id := filenameIndexerID(repo.ID, filename) + return batch.Delete(id) +} + // init init the indexer func (b *BleveIndexer) init() (bool, error) { var err error - b.indexer, err = openIndexer(b.indexDir, repoIndexerLatestVersion) + b.indexer, err = openBleveIndexer(b.indexDir, repoIndexerLatestVersion) if err != nil { return false, err } @@ -241,7 +230,7 @@ func (b *BleveIndexer) init() (bool, error) { return false, nil } - b.indexer, err = createRepoIndexer(b.indexDir, repoIndexerLatestVersion) + b.indexer, err = createBleveIndexer(b.indexDir, repoIndexerLatestVersion) if err != nil { return false, err } @@ -262,38 +251,19 @@ func (b *BleveIndexer) Close() { } // Index indexes the data -func (b *BleveIndexer) Index(repoID int64) error { - repo, err := models.GetRepositoryByID(repoID) - if err != nil { - return err - } - - sha, err := getDefaultBranchSha(repo) - if err != nil { - return err - } - changes, err := getRepoChanges(repo, sha) - if err != nil { - return err - } else if changes == nil { - return nil - } - +func (b *BleveIndexer) Index(repo *models.Repository, sha string, changes *repoChanges) error { batch := rupture.NewFlushingBatch(b.indexer, maxBatchSize) for _, update := range changes.Updates { - if err := addUpdate(sha, update, repo, batch); err != nil { + if err := b.addUpdate(sha, update, repo, batch); err != nil { return err } } for _, filename := range changes.RemovedFilenames { - if err := addDelete(filename, repo, batch); err != nil { + if err := b.addDelete(filename, repo, batch); err != nil { return err } } - if err = batch.Flush(); err != nil { - return err - } - return repo.UpdateIndexerStatus(models.RepoIndexerTypeCode, sha) + return batch.Flush() } // Delete deletes indexes by ids |