about | summary | refs | log | tree | commit | diff | stats
path: root/src/libstat/stat_process.c
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-09-06 19:49:44 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-09-06 19:50:18 +0100
commit c31f8bf12bff61c9422de9eeff0292c6ac339c5e (patch)
tree 224c38634f5d6f45218752ca3abb1b39bc7e4093 /src/libstat/stat_process.c
parent af5f57916e4345d988802794c84460960ee47d0c (diff)
download rspamd-c31f8bf12bff61c9422de9eeff0292c6ac339c5e.tar.gz
rspamd-c31f8bf12bff61c9422de9eeff0292c6ac339c5e.zip
[Feature] Implement new text tokenizer based on libicu
Diffstat (limited to 'src/libstat/stat_process.c')
-rw-r--r-- src/libstat/stat_process.c 12
1 files changed, 11 insertions, 1 deletions
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index 394173444..6d34ba51c 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -365,8 +365,18 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
}
if (sub != NULL) {
- words = rspamd_tokenize_text (sub, strlen (sub), RSPAMD_TOKENIZE_UTF,
+ UText utxt = UTEXT_INITIALIZER;
+ UErrorCode uc_err = U_ZERO_ERROR;
+ gsize slen = strlen (sub);
+
+ utext_openUTF8 (&utxt,
+ sub,
+ slen,
+ &uc_err);
+
+ words = rspamd_tokenize_text (sub, slen, &utxt, RSPAMD_TOKENIZE_UTF,
NULL, NULL, NULL);
+
if (words != NULL) {
for (i = 0; i < words->len; i ++) {