diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-12-05 18:06:12 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-12-05 18:06:12 +0000 |
commit | e1b043f8bf7970278f55ae7ca1a106dee6c4fa98 (patch) | |
tree | 7c4e4205a7eb9341fa1196d8fda5997d20b6976c | |
parent | d027dca0f2c45caa9d8a26e476d44f94a92f639e (diff) | |
download | rspamd-e1b043f8bf7970278f55ae7ca1a106dee6c4fa98.tar.gz rspamd-e1b043f8bf7970278f55ae7ca1a106dee6c4fa98.zip |
[Feature] Add method task:lookup_words
-rw-r--r-- | src/lua/lua_common.c | 153 | ||||
-rw-r--r-- | src/lua/lua_common.h | 7 | ||||
-rw-r--r-- | src/lua/lua_task.c | 122 |
3 files changed, 207 insertions, 75 deletions
diff --git a/src/lua/lua_common.c b/src/lua/lua_common.c index 7bb45f347..01d5dc869 100644 --- a/src/lua/lua_common.c +++ b/src/lua/lua_common.c @@ -2408,12 +2408,90 @@ rspamd_lua_try_load_redis (lua_State *L, const ucl_object_t *obj, return FALSE; } +void +rspamd_lua_push_full_word (lua_State *L, rspamd_stat_token_t *w) +{ + gint fl_cnt; + + lua_createtable (L, 4, 0); + + if (w->stemmed.len > 0) { + lua_pushlstring (L, w->stemmed.begin, w->stemmed.len); + lua_rawseti (L, -2, 1); + } + else { + lua_pushstring (L, ""); + lua_rawseti (L, -2, 1); + } + + if (w->normalized.len > 0) { + lua_pushlstring (L, w->normalized.begin, w->normalized.len); + lua_rawseti (L, -2, 2); + } + else { + lua_pushstring (L, ""); + lua_rawseti (L, -2, 2); + } + + if (w->original.len > 0) { + lua_pushlstring (L, w->original.begin, w->original.len); + lua_rawseti (L, -2, 3); + } + else { + lua_pushstring (L, ""); + lua_rawseti (L, -2, 3); + } + + /* Flags part */ + fl_cnt = 1; + lua_createtable (L, 4, 0); + + if (w->flags & RSPAMD_STAT_TOKEN_FLAG_NORMALISED) { + lua_pushstring (L, "normalised"); + lua_rawseti (L, -2, fl_cnt ++); + } + if (w->flags & RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE) { + lua_pushstring (L, "broken_unicode"); + lua_rawseti (L, -2, fl_cnt ++); + } + if (w->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) { + lua_pushstring (L, "utf"); + lua_rawseti (L, -2, fl_cnt ++); + } + if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) { + lua_pushstring (L, "text"); + lua_rawseti (L, -2, fl_cnt ++); + } + if (w->flags & RSPAMD_STAT_TOKEN_FLAG_HEADER) { + lua_pushstring (L, "header"); + lua_rawseti (L, -2, fl_cnt ++); + } + if (w->flags & (RSPAMD_STAT_TOKEN_FLAG_META|RSPAMD_STAT_TOKEN_FLAG_LUA_META)) { + lua_pushstring (L, "meta"); + lua_rawseti (L, -2, fl_cnt ++); + } + if (w->flags & RSPAMD_STAT_TOKEN_FLAG_STOP_WORD) { + lua_pushstring (L, "stop_word"); + lua_rawseti (L, -2, fl_cnt ++); + } + if (w->flags & RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES) { + lua_pushstring (L, "invisible_spaces"); + lua_rawseti (L, -2, fl_cnt ++); + } + if (w->flags & RSPAMD_STAT_TOKEN_FLAG_STEMMED) { + lua_pushstring (L, "stemmed"); + lua_rawseti (L, -2, fl_cnt ++); + } + + lua_rawseti (L, -2, 4); +} + gint rspamd_lua_push_words (lua_State *L, GArray *words, enum rspamd_lua_words_type how) { rspamd_stat_token_t *w; - guint i, cnt, fl_cnt; + guint i, cnt; lua_createtable (L, words->len, 0); @@ -2440,78 +2518,7 @@ rspamd_lua_push_words (lua_State *L, GArray *words, } break; case RSPAMD_LUA_WORDS_FULL: - lua_createtable (L, 4, 0); - - if (w->stemmed.len > 0) { - lua_pushlstring (L, w->stemmed.begin, w->stemmed.len); - lua_rawseti (L, -2, 1); - } - else { - lua_pushstring (L, ""); - lua_rawseti (L, -2, 1); - } - - if (w->normalized.len > 0) { - lua_pushlstring (L, w->normalized.begin, w->normalized.len); - lua_rawseti (L, -2, 2); - } - else { - lua_pushstring (L, ""); - lua_rawseti (L, -2, 2); - } - - if (w->original.len > 0) { - lua_pushlstring (L, w->original.begin, w->original.len); - lua_rawseti (L, -2, 3); - } - else { - lua_pushstring (L, ""); - lua_rawseti (L, -2, 3); - } - - /* Flags part */ - fl_cnt = 1; - lua_createtable (L, 4, 0); - - if (w->flags & RSPAMD_STAT_TOKEN_FLAG_NORMALISED) { - lua_pushstring (L, "normalised"); - lua_rawseti (L, -2, fl_cnt ++); - } - if (w->flags & RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE) { - lua_pushstring (L, "broken_unicode"); - lua_rawseti (L, -2, fl_cnt ++); - } - if (w->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) { - lua_pushstring (L, "utf"); - lua_rawseti (L, -2, fl_cnt ++); - } - if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) { - lua_pushstring (L, "text"); - lua_rawseti (L, -2, fl_cnt ++); - } - if (w->flags & RSPAMD_STAT_TOKEN_FLAG_HEADER) { - lua_pushstring (L, "header"); - lua_rawseti (L, -2, fl_cnt ++); - } - if (w->flags & (RSPAMD_STAT_TOKEN_FLAG_META|RSPAMD_STAT_TOKEN_FLAG_LUA_META)) { - lua_pushstring (L, "meta"); - lua_rawseti (L, -2, fl_cnt ++); - } - if (w->flags & RSPAMD_STAT_TOKEN_FLAG_STOP_WORD) { - lua_pushstring (L, "stop_word"); - lua_rawseti (L, -2, fl_cnt ++); - } - if (w->flags & RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES) { - lua_pushstring (L, "invisible_spaces"); - lua_rawseti (L, -2, fl_cnt ++); - } - if (w->flags & RSPAMD_STAT_TOKEN_FLAG_STEMMED) { - lua_pushstring (L, "stemmed"); - lua_rawseti (L, -2, fl_cnt ++); - } - - lua_rawseti (L, -2, 4); - + rspamd_lua_push_full_word (L, w); /* Push to the resulting vector */ lua_rawseti (L, -2, cnt ++); break; diff --git a/src/lua/lua_common.h b/src/lua/lua_common.h index 25f5b7ff4..31d7f852b 100644 --- a/src/lua/lua_common.h +++ b/src/lua/lua_common.h @@ -433,6 +433,13 @@ gboolean rspamd_lua_require_function (lua_State *L, const gchar *modname, gboolean rspamd_lua_try_load_redis (lua_State *L, const ucl_object_t *obj, struct rspamd_config *cfg, gint *ref_id); +struct rspamd_stat_token_s; +/** + * Pushes a single word into Lua + * @param L + * @param word + */ +void rspamd_lua_push_full_word (lua_State *L, struct rspamd_stat_token_s *word); enum rspamd_lua_words_type { RSPAMD_LUA_WORDS_STEM = 0, diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c index 6f4923dc8..a8a53f517 100644 --- a/src/lua/lua_task.c +++ b/src/lua/lua_task.c @@ -21,11 +21,12 @@ #include "unix-std.h" #include "libmime/smtp_parsers.h" #include "libserver/mempool_vars_internal.h" +#include "libserver/dkim.h" #include "libserver/task.h" #include "libstat/stat_api.h" +#include "libutil/map_helpers.h" + #include <math.h> -#include <src/libserver/task.h> -#include <src/libserver/dkim.h> /*** * @module rspamd_task @@ -958,6 +959,17 @@ LUA_FUNCTION_DEF (task, get_newlines_type); */ LUA_FUNCTION_DEF (task, get_stat_tokens); +/*** + * @method task:lookup_words(map, function({o, n, s, f}) ... end) + * Matches words in a task (including meta words) against some map (set, regexp and so on) + * and call the specified function with a table containing 4 values: + * - [1] - stemmed word + * - [2] - normalised word + * - [3] - raw word + * - [4] - flags (table of strings) + */ +LUA_FUNCTION_DEF (task, lookup_words); + static const struct luaL_reg tasklib_f[] = { LUA_INTERFACE_DEF (task, load_from_file), LUA_INTERFACE_DEF (task, load_from_string), @@ -1060,6 +1072,7 @@ static const struct luaL_reg tasklib_m[] = { LUA_INTERFACE_DEF (task, get_newlines_type), LUA_INTERFACE_DEF (task, get_stat_tokens), LUA_INTERFACE_DEF (task, get_meta_words), + LUA_INTERFACE_DEF (task, lookup_words), {"__tostring", rspamd_lua_class_tostring}, {NULL, NULL} }; @@ -5171,6 +5184,111 @@ lua_task_get_meta_words (lua_State *L) return 1; } +static guint +lua_lookup_words_array (lua_State *L, + gint cbpos, + struct rspamd_task *task, + struct rspamd_lua_map *map, + GArray *words) +{ + rspamd_stat_token_t *tok; + guint i, nmatched = 0; + gint err_idx; + gboolean matched; + const gchar *key; + gsize keylen; + + for (i = 0; i < words->len; i ++) { + tok = &g_array_index (words, rspamd_stat_token_t, i); + + matched = FALSE; + + if (tok->normalized.len == 0) { + continue; + } + + key = tok->normalized.begin; + keylen = tok->normalized.len; + + switch (map->type) { + case RSPAMD_LUA_MAP_SET: + case RSPAMD_LUA_MAP_HASH: + /* We know that tok->normalized is zero terminated in fact */ + if (rspamd_match_hash_map (map->data.hash, key)) { + matched = TRUE; + } + break; + case RSPAMD_LUA_MAP_REGEXP: + case RSPAMD_LUA_MAP_REGEXP_MULTIPLE: + if (rspamd_match_regexp_map_single (map->data.re_map, key, + keylen)) { + matched = TRUE; + } + break; + default: + g_assert_not_reached (); + break; + } + + if (matched) { + nmatched ++; + + lua_pushcfunction (L, &rspamd_lua_traceback); + err_idx = lua_gettop (L); + lua_pushvalue (L, cbpos); /* Function */ + rspamd_lua_push_full_word (L, tok); + + if (lua_pcall (L, 1, 0, err_idx) != 0) { + GString *tb = lua_touserdata (L, -1); + msg_err_task ("cannot call callback function for lookup words: %s", + tb->str); + g_string_free (tb, TRUE); + } + + lua_settop (L, err_idx - 1); + } + } + + return nmatched; +} + +static gint +lua_task_lookup_words (lua_State *L) +{ + LUA_TRACE_POINT; + struct rspamd_task *task = lua_check_task (L, 1); + struct rspamd_lua_map *map = lua_check_map (L, 2); + struct rspamd_mime_text_part *tp; + + guint i, matches = 0; + + if (task == NULL || map == NULL || lua_type (L, 3) != LUA_TFUNCTION) { + return luaL_error (L, "invalid arguments"); + } + + if (map->type != RSPAMD_LUA_MAP_SET && + map->type != RSPAMD_LUA_MAP_REGEXP && + map->type != RSPAMD_LUA_MAP_HASH && + map->type != RSPAMD_LUA_MAP_REGEXP_MULTIPLE) { + return luaL_error (L, "invalid map type"); + } + + PTR_ARRAY_FOREACH (task->text_parts, i, tp) { + if (tp->utf_words) { + matches += lua_lookup_words_array (L, 3, task, map, tp->utf_words); + } + } + + if (task->meta_words) { + matches += lua_lookup_words_array (L, 3, task, map, task->meta_words); + } + + lua_pushinteger (L, matches); + + return 1; +} + + /* Image functions */ static gint lua_image_get_width (lua_State *L) |