diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-05-20 22:17:45 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-05-20 22:17:45 +0100 |
commit | 255340bb9fbc5e13d3eae070762a5af5f8275fe3 (patch) | |
tree | 68e255d6ab5b78cc3a050e8a694fde14bfa267c5 | |
parent | fc91dd7a8186ef9d4d9f2e861f5d6e482033e748 (diff) | |
download | rspamd-255340bb9fbc5e13d3eae070762a5af5f8275fe3.tar.gz rspamd-255340bb9fbc5e13d3eae070762a5af5f8275fe3.zip |
Add lua bindings to tokenizer.
-rw-r--r-- | src/lua/lua_util.c | 96 |
1 files changed, 96 insertions, 0 deletions
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c index 20c14d24d..37bf98ead 100644 --- a/src/lua/lua_util.c +++ b/src/lua/lua_util.c @@ -25,6 +25,7 @@ #include "task.h" #include "main.h" #include "cfg_rcl.h" +#include "tokenizers/tokenizers.h" /*** * @function util.create_event_base() @@ -52,6 +53,14 @@ LUA_FUNCTION_DEF (util, config_from_ucl); * @return {rspamd_text} encoded data chunk */ LUA_FUNCTION_DEF (util, encode_base64); +/*** + * @function util.tokenize_text(input[, exceptions]) + * Create tokens from a text using optional exceptions list + * @param {text/string} input input data + * @param {table} exceptions, a table of pairs containing <start_pos,lenght> of exceptions in the input + * @return {table/strings} list of strings representing words in the text + */ +LUA_FUNCTION_DEF (util, tokenize_text); LUA_FUNCTION_DEF (util, process_message); static const struct luaL_reg utillib_f[] = { @@ -60,6 +69,7 @@ static const struct luaL_reg utillib_f[] = { LUA_INTERFACE_DEF (util, config_from_ucl), LUA_INTERFACE_DEF (util, process_message), LUA_INTERFACE_DEF (util, encode_base64), + LUA_INTERFACE_DEF (util, tokenize_text), {NULL, NULL} }; @@ -251,6 +261,92 @@ lua_util_encode_base64 (lua_State *L) } static gint +lua_util_tokenize_text (lua_State *L) +{ + const gchar *in = NULL; + gsize len, pos, ex_len, i; + GList *exceptions = NULL, *cur; + struct rspamd_lua_text *t; + struct process_exception *ex; + GArray *res; + rspamd_fstring_t *w; + gboolean compat = FALSE; + + if (lua_type (L, 1) == LUA_TSTRING) { + in = luaL_checklstring (L, 1, &len); + } + else if (lua_type (L, 1) == LUA_TTABLE) { + t = lua_check_text (L, 1); + + if (t) { + in = t->start; + len = t->len; + } + } + + if (in == NULL) { + lua_pushnil (L); + return 1; + } + + if (lua_gettop (L) > 1 && lua_type (L, 2) == LUA_TTABLE) { + lua_pushvalue (L, 2); + lua_pushnil (L); + + while (lua_next (L, -2) != 0) { + if (lua_type (L, -1) == LUA_TTABLE) { + lua_rawgeti (L, -1, 1); + pos = luaL_checknumber (L, -1); + lua_pop (L, 1); + lua_rawgeti (L, -1, 2); + ex_len = luaL_checknumber (L, -1); + lua_pop (L, 1); + + if (ex_len > 0) { + ex = g_slice_alloc (sizeof (*ex)); + ex->pos = pos; + ex->len = ex_len; + exceptions = g_list_prepend (exceptions, ex); + } + } + lua_pop (L, 1); + } + + lua_pop (L, 1); + } + + if (lua_gettop (L) > 2 && lua_type (L, 3) == LUA_TBOOLEAN) { + compat = lua_toboolean (L, 3); + } + + res = rspamd_tokenize_text ((gchar *)in, len, TRUE, 0, exceptions, compat); + + if (res == NULL) { + lua_pushnil (L); + } + else { + lua_newtable (L); + + for (i = 0; i < res->len; i ++) { + w = &g_array_index (res, rspamd_fstring_t, i); + lua_pushlstring (L, w->begin, w->len); + lua_settable (L, i + 1); + } + } + + cur = exceptions; + while (cur) { + ex = cur->data; + g_slice_free1 (sizeof (*ex), ex); + cur = g_list_next (cur); + } + + g_list_free (exceptions); + + return 1; +} + +static gint lua_load_util (lua_State * L) { lua_newtable (L); |