From 8ce10fb6e188af85e1fee01cb25a380769a2ad25 Mon Sep 17 00:00:00 2001 From: Lunny Xiao Date: Sat, 12 Sep 2020 20:31:52 +0800 Subject: Fix bug on elastic search (#12811) * Fix bug on elastic search * Add more comments for elastic search result startIndex and endIndex * refactor indexPos * refactor indexPos * Fix bug --- modules/indexer/code/elastic_search.go | 51 ++++++++++++++++++++--------- modules/indexer/code/elastic_search_test.go | 6 ++++ 2 files changed, 42 insertions(+), 15 deletions(-) (limited to 'modules/indexer/code') diff --git a/modules/indexer/code/elastic_search.go b/modules/indexer/code/elastic_search.go index db36c5e0c4..08b20b80a0 100644 --- a/modules/indexer/code/elastic_search.go +++ b/modules/indexer/code/elastic_search.go @@ -90,6 +90,7 @@ const ( }, "content": { "type": "text", + "term_vector": "with_positions_offsets", "index": true }, "commit_id": { @@ -251,6 +252,22 @@ func (b *ElasticSearchIndexer) Delete(repoID int64) error { return err } +// indexPos find words positions for start and the following end on content. It will +// return the beginning position of the frist start and the ending position of the +// first end following the start string. +// If not found any of the positions, it will return -1, -1. 
+func indexPos(content, start, end string) (int, int) { + startIdx := strings.Index(content, start) + if startIdx < 0 { + return -1, -1 + } + endIdx := strings.Index(content[startIdx+len(start):], end) + if endIdx < 0 { + return -1, -1 + } + return startIdx, startIdx + len(start) + endIdx + len(end) +} + func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) { hits := make([]*SearchResult, 0, pageSize) for _, hit := range searchResult.Hits.Hits { @@ -260,18 +277,12 @@ func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) var startIndex, endIndex int = -1, -1 c, ok := hit.Highlight["content"] if ok && len(c) > 0 { - var subStr = make([]rune, 0, len(kw)) - startIndex = strings.IndexFunc(c[0], func(r rune) bool { - if len(subStr) >= len(kw) { - subStr = subStr[1:] - } - subStr = append(subStr, r) - return strings.EqualFold(kw, string(subStr)) - }) - if startIndex > -1 { - endIndex = startIndex + len(kw) - } else { - panic(fmt.Sprintf("1===%#v", hit.Highlight)) + // FIXME: Since the high lighting content will include <em> and </em> for the keywords, + // now we should find the poisitions. But how to avoid html content which contains the + // <em> and </em> tags? If elastic search has handled that? + startIndex, endIndex = indexPos(c[0], "<em>", "</em>") + if startIndex == -1 { + panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0])) } } else { panic(fmt.Sprintf("2===%#v", hit.Highlight)) @@ -293,7 +304,7 @@ func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)), Language: language, StartIndex: startIndex, - EndIndex: endIndex, + EndIndex: endIndex - 9, // remove the length <em></em> since we give Content the original data Color: enry.GetColor(language), }) } @@ -347,7 +358,12 @@ func (b *ElasticSearchIndexer) Search(repoIDs []int64, language, keyword string, Index(b.indexerAliasName). 
Aggregation("language", aggregation). Query(query). - Highlight(elastic.NewHighlight().Field("content")). + Highlight( + elastic.NewHighlight(). + Field("content"). + NumOfFragments(0). // return all highting content on fragments + HighlighterType("fvh"), + ). Sort("repo_id", true). From(start).Size(pageSize). Do(context.Background()) @@ -373,7 +389,12 @@ func (b *ElasticSearchIndexer) Search(repoIDs []int64, language, keyword string, searchResult, err := b.client.Search(). Index(b.indexerAliasName). Query(query). - Highlight(elastic.NewHighlight().Field("content")). + Highlight( + elastic.NewHighlight(). + Field("content"). + NumOfFragments(0). // return all highting content on fragments + HighlighterType("fvh"), + ). Sort("repo_id", true). From(start).Size(pageSize). Do(context.Background()) diff --git a/modules/indexer/code/elastic_search_test.go b/modules/indexer/code/elastic_search_test.go index a230939746..7cf62e0c5f 100644 --- a/modules/indexer/code/elastic_search_test.go +++ b/modules/indexer/code/elastic_search_test.go @@ -34,3 +34,9 @@ func TestESIndexAndSearch(t *testing.T) { testIndexer("elastic_search", t, indexer) } + +func TestIndexPos(t *testing.T) { + startIdx, endIdx := indexPos("test index start and end", "start", "end") + assert.EqualValues(t, 11, startIdx) + assert.EqualValues(t, 24, endIdx) +} -- cgit v1.2.3