From 44776f99b8d31fc26540f5bbb9281361df1a3c63 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 27 Aug 2019 18:20:59 +0100 Subject: [PATCH] [Fix] Fix normalization of non-alphabet based languages --- src/libstat/backends/redis_backend.c | 3 ++- src/libstat/tokenizers/tokenizers.c | 8 ++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/libstat/backends/redis_backend.c b/src/libstat/backends/redis_backend.c index d54767c12..baeb2308d 100644 --- a/src/libstat/backends/redis_backend.c +++ b/src/libstat/backends/redis_backend.c @@ -527,7 +527,8 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task, "HSET %b_tokens %b %b", prefix, (size_t) prefix_len, n0, (size_t) l0, - tok->t1->stemmed.begin, tok->t1->stemmed.len); + tok->t1->stemmed.begin, + tok->t1->stemmed.len); } } else { diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index f69378f9b..ffa1af9db 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -627,14 +627,10 @@ rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen, } #endif - if (cat == U_UPPERCASE_LETTER || - cat == U_LOWERCASE_LETTER || - cat == U_DECIMAL_DIGIT_NUMBER || + if ((cat >= U_UPPERCASE_LETTER && cat <= U_OTHER_NUMBER) || cat == U_CONNECTOR_PUNCTUATION || cat == U_MATH_SYMBOL || - cat == U_CURRENCY_SYMBOL || - cat == U_INITIAL_PUNCTUATION || - cat == U_FINAL_PUNCTUATION) { + cat == U_CURRENCY_SYMBOL) { *d++ = u_tolower (t); } } -- 2.39.5