From: Vsevolod Stakhov
Date: Tue, 27 Aug 2019 17:20:59 +0000 (+0100)
Subject: [Fix] Fix normalization of non-alphabet based languages
X-Git-Tag: 2.0~332
X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=a7f5e5eb06168374f1ee25b744e4b37f1ad4c8a0;p=rspamd.git

[Fix] Fix normalization of non-alphabet based languages
---

diff --git a/src/libstat/backends/redis_backend.c b/src/libstat/backends/redis_backend.c
index 9ac6fb445..9dd3624fb 100644
--- a/src/libstat/backends/redis_backend.c
+++ b/src/libstat/backends/redis_backend.c
@@ -526,7 +526,8 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task,
 						"HSET %b_tokens %b %b",
 						prefix, (size_t) prefix_len,
 						n0, (size_t) l0,
-						tok->t1->stemmed.begin, tok->t1->stemmed.len);
+						tok->t1->stemmed.begin,
+						tok->t1->stemmed.len);
 				}
 			}
 			else {
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index ea3c84c67..000f2033c 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -679,14 +679,10 @@ rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen,
 		}
 #endif
 
-		if (cat == U_UPPERCASE_LETTER ||
-			cat == U_LOWERCASE_LETTER ||
-			cat == U_DECIMAL_DIGIT_NUMBER ||
+		if ((cat >= U_UPPERCASE_LETTER && cat <= U_OTHER_NUMBER) ||
 			cat == U_CONNECTOR_PUNCTUATION ||
 			cat == U_MATH_SYMBOL ||
-			cat == U_CURRENCY_SYMBOL ||
-			cat == U_INITIAL_PUNCTUATION ||
-			cat == U_FINAL_PUNCTUATION) {
+			cat == U_CURRENCY_SYMBOL) {
 			*d++ = u_tolower (t);
 		}
 	}