Fix bug on elastic search (#12811)

* Fix bug on elastic search * Add more comments for elastic search result startIndex and endIndex * refactor indexPos * refactor indexPos * Fix bug
author: Lunny Xiao <xiaolunwen@gmail.com> 2020-09-12 20:31:52 +0800
committer: GitHub <noreply@github.com> 2020-09-12 20:31:52 +0800
commit: 8ce10fb6e188af85e1fee01cb25a380769a2ad25 (patch)
tree: 65d0e2f192dd3d90991925a70d19b08363e600b1 /modules/indexer
parent: ae528d832133e43c7465717a4f7a442e84d0ea9a (diff)
download: gitea-8ce10fb6e188af85e1fee01cb25a380769a2ad25.tar.gz
gitea-8ce10fb6e188af85e1fee01cb25a380769a2ad25.zip
2 files changed, 42 insertions, 15 deletions
diff --git a/modules/indexer/code/elastic_search.go b/modules/indexer/code/elastic_search.go
index db36c5e0c4..08b20b80a0 100644
--- a/modules/indexer/code/elastic_search.go
+++ b/modules/indexer/code/elastic_search.go
@@ -90,6 +90,7 @@ const (
 				},
 				"content": {
 					"type": "text",
+					"term_vector": "with_positions_offsets",
 					"index": true
 				},
 				"commit_id": {
@@ -251,6 +252,22 @@ func (b *ElasticSearchIndexer) Delete(repoID int64) error {
 	return err
 }
 
+// indexPos finds the positions of start and of the following end in content. It
+// returns the beginning position of the first start and the ending position of the
+// first end following that start string.
+// If either string is not found, it returns -1, -1.
+func indexPos(content, start, end string) (int, int) {
+	startIdx := strings.Index(content, start)
+	if startIdx < 0 {
+		return -1, -1
+	}
+	endIdx := strings.Index(content[startIdx+len(start):], end)
+	if endIdx < 0 {
+		return -1, -1
+	}
+	return startIdx, startIdx + len(start) + endIdx + len(end)
+}
+
 func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) {
 	hits := make([]*SearchResult, 0, pageSize)
 	for _, hit := range searchResult.Hits.Hits {
@@ -260,18 +277,12 @@ func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int)
 		var startIndex, endIndex int = -1, -1
 		c, ok := hit.Highlight["content"]
 		if ok && len(c) > 0 {
-			var subStr = make([]rune, 0, len(kw))
-			startIndex = strings.IndexFunc(c[0], func(r rune) bool {
-				if len(subStr) >= len(kw) {
-					subStr = subStr[1:]
-				}
-				subStr = append(subStr, r)
-				return strings.EqualFold(kw, string(subStr))
-			})
-			if startIndex > -1 {
-				endIndex = startIndex + len(kw)
-			} else {
-				panic(fmt.Sprintf("1===%#v", hit.Highlight))
+			// FIXME: Since the highlighted content wraps the keywords in <em> and </em>,
+			// we now have to find their positions. But how do we avoid HTML content that itself
+			// contains <em> and </em> tags? Does Elasticsearch already handle that?
+			startIndex, endIndex = indexPos(c[0], "<em>", "</em>")
+			if startIndex == -1 {
+				panic(fmt.Sprintf("1===%s,,,%#v,,,%s", kw, hit.Highlight, c[0]))
 			}
 		} else {
 			panic(fmt.Sprintf("2===%#v", hit.Highlight))
@@ -293,7 +304,7 @@ func convertResult(searchResult *elastic.SearchResult, kw string, pageSize int)
 			UpdatedUnix: timeutil.TimeStamp(res["updated_at"].(float64)),
 			Language:    language,
 			StartIndex:  startIndex,
-			EndIndex:    endIndex,
+			EndIndex:    endIndex - 9, // remove the length <em></em> since we give Content the original data
 			Color:       enry.GetColor(language),
 		})
 	}
@@ -347,7 +358,12 @@ func (b *ElasticSearchIndexer) Search(repoIDs []int64, language, keyword string,
 			Index(b.indexerAliasName).
 			Aggregation("language", aggregation).
 			Query(query).
-			Highlight(elastic.NewHighlight().Field("content")).
+			Highlight(
+				elastic.NewHighlight().
+					Field("content").
+					NumOfFragments(0). // return all highting content on fragments
+					HighlighterType("fvh"),
+			).
 			Sort("repo_id", true).
 			From(start).Size(pageSize).
 			Do(context.Background())
@@ -373,7 +389,12 @@ func (b *ElasticSearchIndexer) Search(repoIDs []int64, language, keyword string,
 	searchResult, err := b.client.Search().
 		Index(b.indexerAliasName).
 		Query(query).
-		Highlight(elastic.NewHighlight().Field("content")).
+		Highlight(
+			elastic.NewHighlight().
+				Field("content").
+				NumOfFragments(0). // return all highting content on fragments
+				HighlighterType("fvh"),
+		).
 		Sort("repo_id", true).
 		From(start).Size(pageSize).
 		Do(context.Background())
diff --git a/modules/indexer/code/elastic_search_test.go b/modules/indexer/code/elastic_search_test.go
index a230939746..7cf62e0c5f 100644
--- a/modules/indexer/code/elastic_search_test.go
+++ b/modules/indexer/code/elastic_search_test.go
@@ -34,3 +34,9 @@ func TestESIndexAndSearch(t *testing.T) {
 
 	testIndexer("elastic_search", t, indexer)
 }
+
+func TestIndexPos(t *testing.T) {
+	startIdx, endIdx := indexPos("test index start and end", "start", "end")
+	assert.EqualValues(t, 11, startIdx)
+	assert.EqualValues(t, 24, endIdx)
+}
author	Lunny Xiao <xiaolunwen@gmail.com>	2020-09-12 20:31:52 +0800
committer	GitHub <noreply@github.com>	2020-09-12 20:31:52 +0800
commit	8ce10fb6e188af85e1fee01cb25a380769a2ad25 (patch)
tree	65d0e2f192dd3d90991925a70d19b08363e600b1 /modules/indexer
parent	ae528d832133e43c7465717a4f7a442e84d0ea9a (diff)
download	gitea-8ce10fb6e188af85e1fee01cb25a380769a2ad25.tar.gz gitea-8ce10fb6e188af85e1fee01cb25a380769a2ad25.zip