summaryrefslogtreecommitdiffstats
path: root/modules
diff options
context:
space:
mode:
authorJason Song <i@wolfogre.com>2023-07-04 17:05:28 +0800
committerGitHub <noreply@github.com>2023-07-04 09:05:28 +0000
commit9958642502f8b505f97589d7a7f5357e8dfc04e3 (patch)
treee9731af468ee4a4c9a600a1a51e44eb41dfb2f1d /modules
parentdae022ab2a25e82cf89027f865c142ebdff0b5ea (diff)
downloadgitea-9958642502f8b505f97589d7a7f5357e8dfc04e3.tar.gz
gitea-9958642502f8b505f97589d7a7f5357e8dfc04e3.zip
Fix issues indexer document mapping (#25619)
Fix regression of #5363 (so long ago). The old code defined a document mapping for `issueIndexerDocType`, and assigned it to `BleveIndexerData` as its type. (`BleveIndexerData` was renamed to `IndexerData` in #25174, but nothing more.) But the old code never used `BleveIndexerData`; it wrote the index with an anonymous struct type. Nonetheless, bleve would use the default auto-mapping for structs it didn't know, so the indexer still worked. This means the custom document mapping was always dead code. The custom document mapping is not useless: it can reduce index storage. This PR brings it back and disables the default mapping to prevent this from happening again. Since `IndexerData` (`BleveIndexerData`) has JSON tags, and bleve uses them first, we should use `repo_id` as the field name instead of `RepoID`. I did a test to compare the storage size before and after this, with about 3k real comments that were migrated from some public repos. Before: ```text [ 160] . ├── [ 42] index_meta.json ├── [ 13] rupture_meta.json └── [ 128] store ├── [6.9M] 00000000005d.zap └── [256K] root.bolt ``` After: ```text [ 160] . ├── [ 42] index_meta.json ├── [ 13] rupture_meta.json └── [ 128] store ├── [3.5M] 000000000065.zap └── [256K] root.bolt ``` It saves about half the storage space. --------- Co-authored-by: Giteabot <teabot@gitea.io>
Diffstat (limited to 'modules')
-rw-r--r--modules/indexer/issues/bleve/bleve.go32
1 files changed, 12 insertions, 20 deletions
diff --git a/modules/indexer/issues/bleve/bleve.go b/modules/indexer/issues/bleve/bleve.go
index bb0bc4b04a..50a2306ee6 100644
--- a/modules/indexer/issues/bleve/bleve.go
+++ b/modules/indexer/issues/bleve/bleve.go
@@ -23,7 +23,7 @@ import (
const (
issueIndexerAnalyzer = "issueIndexer"
issueIndexerDocType = "issueIndexerDocType"
- issueIndexerLatestVersion = 2
+ issueIndexerLatestVersion = 3
)
// numericEqualityQuery a numeric equality query for the given value and field
@@ -67,15 +67,16 @@ func generateIssueIndexMapping() (mapping.IndexMapping, error) {
docMapping := bleve.NewDocumentMapping()
numericFieldMapping := bleve.NewNumericFieldMapping()
+ numericFieldMapping.Store = false
numericFieldMapping.IncludeInAll = false
- docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping)
+ docMapping.AddFieldMappingsAt("repo_id", numericFieldMapping)
textFieldMapping := bleve.NewTextFieldMapping()
textFieldMapping.Store = false
textFieldMapping.IncludeInAll = false
- docMapping.AddFieldMappingsAt("Title", textFieldMapping)
- docMapping.AddFieldMappingsAt("Content", textFieldMapping)
- docMapping.AddFieldMappingsAt("Comments", textFieldMapping)
+ docMapping.AddFieldMappingsAt("title", textFieldMapping)
+ docMapping.AddFieldMappingsAt("content", textFieldMapping)
+ docMapping.AddFieldMappingsAt("comments", textFieldMapping)
if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
return nil, err
@@ -91,6 +92,7 @@ func generateIssueIndexMapping() (mapping.IndexMapping, error) {
mapping.DefaultAnalyzer = issueIndexerAnalyzer
mapping.AddDocumentMapping(issueIndexerDocType, docMapping)
mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
+ mapping.DefaultMapping = bleve.NewDocumentDisabledMapping() // disable default mapping, avoid indexing unexpected structs
return mapping, nil
}
@@ -116,17 +118,7 @@ func NewIndexer(indexDir string) *Indexer {
func (b *Indexer) Index(_ context.Context, issues []*internal.IndexerData) error {
batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
for _, issue := range issues {
- if err := batch.Index(indexer_internal.Base36(issue.ID), struct {
- RepoID int64
- Title string
- Content string
- Comments []string
- }{
- RepoID: issue.RepoID,
- Title: issue.Title,
- Content: issue.Content,
- Comments: issue.Comments,
- }); err != nil {
+ if err := batch.Index(indexer_internal.Base36(issue.ID), (*IndexerData)(issue)); err != nil {
return err
}
}
@@ -149,7 +141,7 @@ func (b *Indexer) Delete(_ context.Context, ids ...int64) error {
func (b *Indexer) Search(ctx context.Context, keyword string, repoIDs []int64, limit, start int) (*internal.SearchResult, error) {
var repoQueriesP []*query.NumericRangeQuery
for _, repoID := range repoIDs {
- repoQueriesP = append(repoQueriesP, numericEqualityQuery(repoID, "RepoID"))
+ repoQueriesP = append(repoQueriesP, numericEqualityQuery(repoID, "repo_id"))
}
repoQueries := make([]query.Query, len(repoQueriesP))
for i, v := range repoQueriesP {
@@ -159,9 +151,9 @@ func (b *Indexer) Search(ctx context.Context, keyword string, repoIDs []int64, l
indexerQuery := bleve.NewConjunctionQuery(
bleve.NewDisjunctionQuery(repoQueries...),
bleve.NewDisjunctionQuery(
- newMatchPhraseQuery(keyword, "Title", issueIndexerAnalyzer),
- newMatchPhraseQuery(keyword, "Content", issueIndexerAnalyzer),
- newMatchPhraseQuery(keyword, "Comments", issueIndexerAnalyzer),
+ newMatchPhraseQuery(keyword, "title", issueIndexerAnalyzer),
+ newMatchPhraseQuery(keyword, "content", issueIndexerAnalyzer),
+ newMatchPhraseQuery(keyword, "comments", issueIndexerAnalyzer),
))
search := bleve.NewSearchRequestOptions(indexerQuery, limit, start, false)
search.SortBy([]string{"-_score"})