From c31f8bf12bff61c9422de9eeff0292c6ac339c5e Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Thu, 6 Sep 2018 19:49:44 +0100
Subject: [Feature] Implement new text tokenizer based on libicu

---
 src/lua/lua_util.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'src/lua')

diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 3de68e60a..d6095ab52 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -1078,6 +1078,7 @@ lua_util_tokenize_text (lua_State *L)
 	GList *exceptions = NULL, *cur;
 	struct rspamd_lua_text *t;
 	struct rspamd_process_exception *ex;
+	UText utxt = UTEXT_INITIALIZER;
 	GArray *res;
 	rspamd_stat_token_t *w;
 
@@ -1129,7 +1130,15 @@ lua_util_tokenize_text (lua_State *L)
 		exceptions = g_list_reverse (exceptions);
 	}
 
-	res = rspamd_tokenize_text ((gchar *)in, len, RSPAMD_TOKENIZE_UTF, NULL,
+	UErrorCode uc_err = U_ZERO_ERROR;
+	utext_openUTF8 (&utxt,
+			in,
+			len,
+			&uc_err);
+
+	res = rspamd_tokenize_text ((gchar *)in, len,
+			&utxt,
+			RSPAMD_TOKENIZE_UTF, NULL,
 			exceptions, NULL);
 
-- 
cgit v1.2.3