summaryrefslogtreecommitdiffstats
path: root/modules
diff options
context:
space:
mode:
author: Giteabot <teabot@gitea.io> 2024-05-01 20:59:59 +0800
committer: GitHub <noreply@github.com> 2024-05-01 12:59:59 +0000
commit: 97a7c04a8fc4747d32af84fca3d068425ab33768 (patch)
tree: e895d08c0bad5bc1be0ce282af4689bbe7239b3c /modules
parent: 99e89e57bc4d2b3a3cd6c3068f818a0c7e8cec28 (diff)
download: gitea-97a7c04a8fc4747d32af84fca3d068425ab33768.tar.gz
gitea-97a7c04a8fc4747d32af84fca3d068425ab33768.zip
Fix bleve fuzziness (#30799) (#30804)
Backport #30799 by wxiaoguang. Fix #30797. Fix #30317. Co-authored-by: wxiaoguang <wxiaoguang@gmail.com>
Diffstat (limited to 'modules')
-rw-r--r-- modules/indexer/code/bleve/bleve.go 4
-rw-r--r-- modules/indexer/internal/bleve/util.go 12
-rw-r--r-- modules/indexer/issues/bleve/bleve.go 8
3 files changed, 15 insertions, 9 deletions
diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go
index bd844205a6..8056b58ec2 100644
--- a/modules/indexer/code/bleve/bleve.go
+++ b/modules/indexer/code/bleve/bleve.go
@@ -39,8 +39,6 @@ import (
const (
unicodeNormalizeName = "unicodeNormalize"
maxBatchSize = 16
- // fuzzyDenominator determines the levenshtein distance per each character of a keyword
- fuzzyDenominator = 4
)
func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
@@ -245,7 +243,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
phraseQuery.Analyzer = repoIndexerAnalyzer
keywordQuery = phraseQuery
if opts.IsKeywordFuzzy {
- phraseQuery.Fuzziness = len(opts.Keyword) / fuzzyDenominator
+ phraseQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
}
if len(opts.RepoIDs) > 0 {
diff --git a/modules/indexer/internal/bleve/util.go b/modules/indexer/internal/bleve/util.go
index 43a7c3c5ec..a2265f86e6 100644
--- a/modules/indexer/internal/bleve/util.go
+++ b/modules/indexer/internal/bleve/util.go
@@ -47,3 +47,15 @@ func openIndexer(path string, latestVersion int) (bleve.Index, int, error) {
return index, 0, nil
}
+
+func GuessFuzzinessByKeyword(s string) int {
+ // GuessFuzzinessByKeyword picks the fuzziness for keyword s: 0 if any rune is >= 128, otherwise min(2, len(s)/4).
+ // The cap of 2 is the max fuzziness bleve supports, according to https://github.com/blevesearch/bleve/issues/1563.
+ // The divisor 4 sets the levenshtein distance allowed per keyword character; CJK keywords (eg: `갃갃갃` `啊啊啊`) mismatch a lot when fuzzy, so they get 0.
+ for _, r := range s {
+ if r >= 128 {
+ return 0
+ }
+ }
+ return min(2, len(s)/4)
+}
diff --git a/modules/indexer/issues/bleve/bleve.go b/modules/indexer/issues/bleve/bleve.go
index 1f54be721b..d7957b266a 100644
--- a/modules/indexer/issues/bleve/bleve.go
+++ b/modules/indexer/issues/bleve/bleve.go
@@ -35,11 +35,7 @@ func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
})
}
-const (
- maxBatchSize = 16
- // fuzzyDenominator determines the levenshtein distance per each character of a keyword
- fuzzyDenominator = 4
-)
+const maxBatchSize = 16
// IndexerData an update to the issue indexer
type IndexerData internal.IndexerData
@@ -162,7 +158,7 @@ func (b *Indexer) Search(ctx context.Context, options *internal.SearchOptions) (
if options.Keyword != "" {
fuzziness := 0
if options.IsFuzzyKeyword {
- fuzziness = len(options.Keyword) / fuzzyDenominator
+ fuzziness = inner_bleve.GuessFuzzinessByKeyword(options.Keyword)
}
queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{