diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-08-27 18:20:59 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-08-28 10:05:24 +0100 |
commit | 44776f99b8d31fc26540f5bbb9281361df1a3c63 (patch) | |
tree | 9411b78ebac6499d67458c7b38f7f9b8e11cd1ab | |
parent | 24f77f81a8928479f99ff96fb1f1d1c0f64886ee (diff) | |
download | rspamd-44776f99b8d31fc26540f5bbb9281361df1a3c63.tar.gz rspamd-44776f99b8d31fc26540f5bbb9281361df1a3c63.zip |
[Fix] Fix normalization of non-alphabet based languages
-rw-r--r-- | src/libstat/backends/redis_backend.c | 3 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 8 |
2 files changed, 4 insertions, 7 deletions
diff --git a/src/libstat/backends/redis_backend.c b/src/libstat/backends/redis_backend.c
index d54767c12..baeb2308d 100644
--- a/src/libstat/backends/redis_backend.c
+++ b/src/libstat/backends/redis_backend.c
@@ -527,7 +527,8 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task,
 "HSET %b_tokens %b %b",
 prefix, (size_t) prefix_len,
 n0, (size_t) l0,
- tok->t1->stemmed.begin, tok->t1->stemmed.len);
+ tok->t1->stemmed.begin,
+ tok->t1->stemmed.len);
 }
 }
 else {
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index f69378f9b..ffa1af9db 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -627,14 +627,10 @@ rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen,
 }
 #endif
 
- if (cat == U_UPPERCASE_LETTER ||
- cat == U_LOWERCASE_LETTER ||
- cat == U_DECIMAL_DIGIT_NUMBER ||
+ if ((cat >= U_UPPERCASE_LETTER && cat <= U_OTHER_NUMBER) ||
 cat == U_CONNECTOR_PUNCTUATION ||
 cat == U_MATH_SYMBOL ||
- cat == U_CURRENCY_SYMBOL ||
- cat == U_INITIAL_PUNCTUATION ||
- cat == U_FINAL_PUNCTUATION) {
+ cat == U_CURRENCY_SYMBOL) {
 *d++ = u_tolower (t);
 }
 }