aboutsummaryrefslogtreecommitdiffstats
path: root/modules/indexer/code/bleve.go
diff options
context:
space:
mode:
author: Lunny Xiao <xiaolunwen@gmail.com> 2020-08-31 00:08:01 +0800
committer: GitHub <noreply@github.com> 2020-08-30 19:08:01 +0300
commit: 9bc69ff26eeebaf3b622d62d18c757ff1f401dda (patch)
tree: 69ff71d9d460e83a6fff54b172b604732ab5d065 /modules/indexer/code/bleve.go
parent: d257485bc0026c9717fe7bf4c9953ad1b7a1a9ae (diff)
download: gitea-9bc69ff26eeebaf3b622d62d18c757ff1f401dda.tar.gz
gitea-9bc69ff26eeebaf3b622d62d18c757ff1f401dda.zip
Support elastic search for code search (#10273)
* Support elastic search for code search
* Finished elastic search implementation and add some tests
* Enable test on drone and added docs
* Add new fields to elastic search
* Fix bug
* remove unused changes
* Use indexer alias to keep the gitea indexer version
* Improve codes
* Some code improvements
* The real indexer name changed to xxx.v1

Co-authored-by: zeripath <art27@cantab.net>
Diffstat (limited to 'modules/indexer/code/bleve.go')
-rw-r--r-- modules/indexer/code/bleve.go 132
1 files changed, 51 insertions, 81 deletions
diff --git a/modules/indexer/code/bleve.go b/modules/indexer/code/bleve.go
index 6502259ba4..81373bf3da 100644
--- a/modules/indexer/code/bleve.go
+++ b/modules/indexer/code/bleve.go
@@ -58,10 +58,10 @@ func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
})
}
-// openIndexer open the index at the specified path, checking for metadata
+// openBleveIndexer open the index at the specified path, checking for metadata
// updates and bleve version updates. If index needs to be created (or
// re-created), returns (nil, nil)
-func openIndexer(path string, latestVersion int) (bleve.Index, error) {
+func openBleveIndexer(path string, latestVersion int) (bleve.Index, error) {
_, err := os.Stat(path)
if err != nil && os.IsNotExist(err) {
return nil, nil
@@ -104,54 +104,14 @@ func (d *RepoIndexerData) Type() string {
return repoIndexerDocType
}
-func addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
- // Ignore vendored files in code search
- if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
- return nil
- }
- stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
- RunInDir(repo.RepoPath())
- if err != nil {
- return err
- }
- if size, err := strconv.Atoi(strings.TrimSpace(stdout)); err != nil {
- return fmt.Errorf("Misformatted git cat-file output: %v", err)
- } else if int64(size) > setting.Indexer.MaxIndexerFileSize {
- return addDelete(update.Filename, repo, batch)
- }
-
- fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha).
- RunInDirBytes(repo.RepoPath())
- if err != nil {
- return err
- } else if !base.IsTextFile(fileContents) {
- // FIXME: UTF-16 files will probably fail here
- return nil
- }
-
- id := filenameIndexerID(repo.ID, update.Filename)
- return batch.Index(id, &RepoIndexerData{
- RepoID: repo.ID,
- CommitID: commitSha,
- Content: string(charset.ToUTF8DropErrors(fileContents)),
- Language: analyze.GetCodeLanguage(update.Filename, fileContents),
- UpdatedAt: time.Now().UTC(),
- })
-}
-
-func addDelete(filename string, repo *models.Repository, batch rupture.FlushingBatch) error {
- id := filenameIndexerID(repo.ID, filename)
- return batch.Delete(id)
-}
-
const (
repoIndexerAnalyzer = "repoIndexerAnalyzer"
repoIndexerDocType = "repoIndexerDocType"
repoIndexerLatestVersion = 5
)
-// createRepoIndexer create a repo indexer if one does not already exist
-func createRepoIndexer(path string, latestVersion int) (bleve.Index, error) {
+// createBleveIndexer create a bleve repo indexer if one does not already exist
+func createBleveIndexer(path string, latestVersion int) (bleve.Index, error) {
docMapping := bleve.NewDocumentMapping()
numericFieldMapping := bleve.NewNumericFieldMapping()
numericFieldMapping.IncludeInAll = false
@@ -199,18 +159,6 @@ func createRepoIndexer(path string, latestVersion int) (bleve.Index, error) {
return indexer, nil
}
-func filenameIndexerID(repoID int64, filename string) string {
- return indexerID(repoID) + "_" + filename
-}
-
-func filenameOfIndexerID(indexerID string) string {
- index := strings.IndexByte(indexerID, '_')
- if index == -1 {
- log.Error("Unexpected ID in repo indexer: %s", indexerID)
- }
- return indexerID[index+1:]
-}
-
var (
_ Indexer = &BleveIndexer{}
)
@@ -230,10 +178,51 @@ func NewBleveIndexer(indexDir string) (*BleveIndexer, bool, error) {
return indexer, created, err
}
+func (b *BleveIndexer) addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
+ // Ignore vendored files in code search
+ if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
+ return nil
+ }
+
+ stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
+ RunInDir(repo.RepoPath())
+ if err != nil {
+ return err
+ }
+ if size, err := strconv.Atoi(strings.TrimSpace(stdout)); err != nil {
+ return fmt.Errorf("Misformatted git cat-file output: %v", err)
+ } else if int64(size) > setting.Indexer.MaxIndexerFileSize {
+ return b.addDelete(update.Filename, repo, batch)
+ }
+
+ fileContents, err := git.NewCommand("cat-file", "blob", update.BlobSha).
+ RunInDirBytes(repo.RepoPath())
+ if err != nil {
+ return err
+ } else if !base.IsTextFile(fileContents) {
+ // FIXME: UTF-16 files will probably fail here
+ return nil
+ }
+
+ id := filenameIndexerID(repo.ID, update.Filename)
+ return batch.Index(id, &RepoIndexerData{
+ RepoID: repo.ID,
+ CommitID: commitSha,
+ Content: string(charset.ToUTF8DropErrors(fileContents)),
+ Language: analyze.GetCodeLanguage(update.Filename, fileContents),
+ UpdatedAt: time.Now().UTC(),
+ })
+}
+
+func (b *BleveIndexer) addDelete(filename string, repo *models.Repository, batch rupture.FlushingBatch) error {
+ id := filenameIndexerID(repo.ID, filename)
+ return batch.Delete(id)
+}
+
// init init the indexer
func (b *BleveIndexer) init() (bool, error) {
var err error
- b.indexer, err = openIndexer(b.indexDir, repoIndexerLatestVersion)
+ b.indexer, err = openBleveIndexer(b.indexDir, repoIndexerLatestVersion)
if err != nil {
return false, err
}
@@ -241,7 +230,7 @@ func (b *BleveIndexer) init() (bool, error) {
return false, nil
}
- b.indexer, err = createRepoIndexer(b.indexDir, repoIndexerLatestVersion)
+ b.indexer, err = createBleveIndexer(b.indexDir, repoIndexerLatestVersion)
if err != nil {
return false, err
}
@@ -262,38 +251,19 @@ func (b *BleveIndexer) Close() {
}
// Index indexes the data
-func (b *BleveIndexer) Index(repoID int64) error {
- repo, err := models.GetRepositoryByID(repoID)
- if err != nil {
- return err
- }
-
- sha, err := getDefaultBranchSha(repo)
- if err != nil {
- return err
- }
- changes, err := getRepoChanges(repo, sha)
- if err != nil {
- return err
- } else if changes == nil {
- return nil
- }
-
+func (b *BleveIndexer) Index(repo *models.Repository, sha string, changes *repoChanges) error {
batch := rupture.NewFlushingBatch(b.indexer, maxBatchSize)
for _, update := range changes.Updates {
- if err := addUpdate(sha, update, repo, batch); err != nil {
+ if err := b.addUpdate(sha, update, repo, batch); err != nil {
return err
}
}
for _, filename := range changes.RemovedFilenames {
- if err := addDelete(filename, repo, batch); err != nil {
+ if err := b.addDelete(filename, repo, batch); err != nil {
return err
}
}
- if err = batch.Flush(); err != nil {
- return err
- }
- return repo.UpdateIndexerStatus(models.RepoIndexerTypeCode, sha)
+ return batch.Flush()
}
// Delete deletes indexes by ids