const (
unicodeNormalizeName = "unicodeNormalize"
maxBatchSize = 16
- // fuzzyDenominator determines the levenshtein distance per each character of a keyword
- fuzzyDenominator = 4
)
func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
phraseQuery.Analyzer = repoIndexerAnalyzer
keywordQuery = phraseQuery
if opts.IsKeywordFuzzy {
- phraseQuery.Fuzziness = len(opts.Keyword) / fuzzyDenominator
+ phraseQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
}
if len(opts.RepoIDs) > 0 {
return index, 0, nil
}
+
+func GuessFuzzinessByKeyword(s string) int {
+ // GuessFuzzinessByKeyword picks the fuzziness (allowed Levenshtein distance) for keyword s: one edit per 4 characters,
+ // capped at 2, which is the max fuzziness supported according to https://github.com/blevesearch/bleve/issues/1563.
+ // BUT fuzzy matching mismatches a lot on CJK keywords (e.g. `갃갃갃` `啊啊啊`), so any keyword containing a non-ASCII rune returns 0.
+ for _, r := range s {
+ if r >= 128 { // rune outside the ASCII range
+ return 0
+ }
+ }
+ return min(2, len(s)/4) // s is pure ASCII here, so the byte length len(s) equals its character count
+}
})
}
-const (
- maxBatchSize = 16
- // fuzzyDenominator determines the levenshtein distance per each character of a keyword
- fuzzyDenominator = 4
-)
+const maxBatchSize = 16
// IndexerData an update to the issue indexer
type IndexerData internal.IndexerData
if options.Keyword != "" {
fuzziness := 0
if options.IsFuzzyKeyword {
- fuzziness = len(options.Keyword) / fuzzyDenominator
+ fuzziness = inner_bleve.GuessFuzzinessByKeyword(options.Keyword)
}
queries = append(queries, bleve.NewDisjunctionQuery([]query.Query{
ctx.Data["Language"] = language
ctx.Data["IsFuzzy"] = isFuzzy
ctx.Data["PageIsViewCode"] = true
+ ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled
if keyword == "" {
ctx.HTML(http.StatusOK, tplSearch)
}
}
- ctx.Data["IsRepoIndexerEnabled"] = setting.Indexer.RepoIndexerEnabled
ctx.Data["Repo"] = ctx.Repo.Repository
ctx.Data["SearchResults"] = searchResults
ctx.Data["SearchResultLanguages"] = searchResultLanguages