From a7f5e5eb06168374f1ee25b744e4b37f1ad4c8a0 Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Tue, 27 Aug 2019 18:20:59 +0100
Subject: [PATCH] [Fix] Fix normalization of non-alphabet based languages

---
 src/libstat/backends/redis_backend.c | 3 ++-
 src/libstat/tokenizers/tokenizers.c | 8 ++------
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/libstat/backends/redis_backend.c b/src/libstat/backends/redis_backend.c
index 9ac6fb445..9dd3624fb 100644
--- a/src/libstat/backends/redis_backend.c
+++ b/src/libstat/backends/redis_backend.c
@@ -526,7 +526,8 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task,
 						"HSET %b_tokens %b %b",
 						prefix, (size_t) prefix_len,
 						n0, (size_t) l0,
-						tok->t1->stemmed.begin, tok->t1->stemmed.len);
+						tok->t1->stemmed.begin,
+						tok->t1->stemmed.len);
 				}
 			}
 			else {
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index ea3c84c67..000f2033c 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -679,14 +679,10 @@ rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen,
 		}
 #endif

-		if (cat == U_UPPERCASE_LETTER ||
-			cat == U_LOWERCASE_LETTER ||
-			cat == U_DECIMAL_DIGIT_NUMBER ||
+		if ((cat >= U_UPPERCASE_LETTER && cat <= U_OTHER_NUMBER) ||
 			cat == U_CONNECTOR_PUNCTUATION ||
 			cat == U_MATH_SYMBOL ||
-			cat == U_CURRENCY_SYMBOL ||
-			cat == U_INITIAL_PUNCTUATION ||
-			cat == U_FINAL_PUNCTUATION) {
+			cat == U_CURRENCY_SYMBOL) {
 			*d++ = u_tolower (t);
 		}
 	}
-- 
2.39.5