diff options
author | Lunny Xiao <xiaolunwen@gmail.com> | 2020-09-12 20:31:52 +0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-09-12 20:31:52 +0800 |
commit | 8ce10fb6e188af85e1fee01cb25a380769a2ad25 (patch) | |
tree | 65d0e2f192dd3d90991925a70d19b08363e600b1 /modules/indexer | |
parent | ae528d832133e43c7465717a4f7a442e84d0ea9a (diff) | |
download | gitea-8ce10fb6e188af85e1fee01cb25a380769a2ad25.tar.gz gitea-8ce10fb6e188af85e1fee01cb25a380769a2ad25.zip |
Fix bug on elastic search (#12811)
* Fix bug on elastic search
* Add more comments for elastic search result startIndex and endIndex
* refactor indexPos
* refactor indexPos
* Fix bug
Diffstat (limited to 'modules/indexer')
-rw-r--r-- | modules/indexer/code/elastic_search.go | 51 | ||||
-rw-r--r-- | modules/indexer/code/elastic_search_test.go | 6 |
2 files changed, 42 insertions, 15 deletions
```diff
diff --git a/modules/indexer/code/elastic_search.go b/modules/indexer/code/elastic_search.go
index db36c5e0c4..08b20b80a0 100644
--- a/modules/indexer/code/elastic_search.go
+++ b/modules/indexer/code/elastic_search.go
@@ -90,6 +90,7 @@ const (
 				},
 				"content": {
 					"type": "text",
+					"term_vector": "with_positions_offsets",
 					"index": true
 				},
 				"commit_id": {
@@ -251,6 +252,22 @@ func (b *ElasticSearchIndexer) Delete(repoID int64) error {
 	return err
 }
 
+// indexPos find words positions for start and the following end on content. It will
+// return the beginning position of the frist start and the ending position of the
+// first end following the start string.
+// If not found any of the positions, it will return -1, -1.
+func indexPos(content, start, end string) (int, int) {
+	startIdx := strings.Index(content, start)
+	if startIdx < 0 {
+		return -1, -1
+	}
+	endIdx := strings.Index(content[startIdx+len(start):], end)
+	if endIdx < 0 {
+		return -1, -1
+	}
+	return startIdx, startIdx + len(start) + endIdx + len(end)
+}
+
 func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) {
 	hits := make([]*SearchResult, 0, pageSize)
 	for _, hit := range searchResult.Hits.Hits {
@@ -260,18 +277,12 @@ func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int)
 		var startIndex, endIndex int = -1, -1
 		c, ok := hit.Highlight["content"]
 		if ok && len(c) > 0 {
-			var subStr = make([]rune, 0, len(kw))
-			startIndex = strings.IndexFunc(c[0], func(r rune) bool {
-				if len(subStr) >= len(kw) {
-					subStr = subStr[1:]
-				}
-				subStr = append(subStr, r)
-				return strings.EqualFold(kw, string(subStr))
-			})
-			if startIndex > -1 {
-				endIndex = startIndex + len(kw)
-			} else {
-				panic(fmt.Sprintf("1===%#v", hit.Highlight))
+			// FIXME: Since the high lighting content will include <em> and </em> for the keywords,
+			// now we should find the poisitions. But how to avoid html content which contains the
+			// <em> and </em> tags? If elastic search has handled that?
+			startIndex, endIndex = indexPos(c[0], "<em>", "</em>")
+			if startIndex == -1 {
+				panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0]))
 			}
 		} else {
 			panic(fmt.Sprintf("2===%#v", hit.Highlight))
@@ -293,7 +304,7 @@ func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int)
 			UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)),
 			Language:    language,
 			StartIndex:  startIndex,
-			EndIndex:    endIndex,
+			EndIndex:    endIndex - 9, // remove the length <em></em> since we give Content the original data
 			Color:       enry.GetColor(language),
 		})
 	}
@@ -347,7 +358,12 @@ func (b *ElasticSearchIndexer) Search(repoIDs []int64, language, keyword string,
 			Index(b.indexerAliasName).
 			Aggregation("language", aggregation).
 			Query(query).
-			Highlight(elastic.NewHighlight().Field("content")).
+			Highlight(
+				elastic.NewHighlight().
+					Field("content").
+					NumOfFragments(0). // return all highting content on fragments
+					HighlighterType("fvh"),
+			).
 			Sort("repo_id", true).
 			From(start).Size(pageSize).
 			Do(context.Background())
@@ -373,7 +389,12 @@ func (b *ElasticSearchIndexer) Search(repoIDs []int64, language, keyword string,
 	searchResult, err := b.client.Search().
 		Index(b.indexerAliasName).
 		Query(query).
-		Highlight(elastic.NewHighlight().Field("content")).
+		Highlight(
+			elastic.NewHighlight().
+				Field("content").
+				NumOfFragments(0). // return all highting content on fragments
+				HighlighterType("fvh"),
+		).
 		Sort("repo_id", true).
 		From(start).Size(pageSize).
 		Do(context.Background())
diff --git a/modules/indexer/code/elastic_search_test.go b/modules/indexer/code/elastic_search_test.go
index a230939746..7cf62e0c5f 100644
--- a/modules/indexer/code/elastic_search_test.go
+++ b/modules/indexer/code/elastic_search_test.go
@@ -34,3 +34,9 @@ func TestESIndexAndSearch(t *testing.T) {
 
 	testIndexer("elastic_search", t, indexer)
 }
+
+func TestIndexPos(t *testing.T) {
+	startIdx, endIdx := indexPos("test index start and end", "start", "end")
+	assert.EqualValues(t, 11, startIdx)
+	assert.EqualValues(t, 24, endIdx)
+}
```