diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-09-06 19:49:44 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-09-06 19:50:18 +0100 |
commit | c31f8bf12bff61c9422de9eeff0292c6ac339c5e (patch) | |
tree | 224c38634f5d6f45218752ca3abb1b39bc7e4093 /src/libstat/stat_process.c | |
parent | af5f57916e4345d988802794c84460960ee47d0c (diff) | |
download | rspamd-c31f8bf12bff61c9422de9eeff0292c6ac339c5e.tar.gz rspamd-c31f8bf12bff61c9422de9eeff0292c6ac339c5e.zip |
[Feature] Implement new text tokenizer based on libicu
Diffstat (limited to 'src/libstat/stat_process.c')
-rw-r--r-- | src/libstat/stat_process.c | 12 |
1 file changed, 11 insertions, 1 deletion
```diff
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index 394173444..6d34ba51c 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -365,8 +365,18 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
 			}
 
 			if (sub != NULL) {
-				words = rspamd_tokenize_text (sub, strlen (sub), RSPAMD_TOKENIZE_UTF,
+				UText utxt = UTEXT_INITIALIZER;
+				UErrorCode uc_err = U_ZERO_ERROR;
+				gsize slen = strlen (sub);
+
+				utext_openUTF8 (&utxt,
+						sub,
+						slen,
+						&uc_err);
+
+				words = rspamd_tokenize_text (sub, slen, &utxt, RSPAMD_TOKENIZE_UTF,
 						NULL, NULL, NULL);
+
 				if (words != NULL) {
 
 					for (i = 0; i < words->len; i ++) {
```