From c69f378fa39821caadd601c69909c829ffe31ea4 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 18 Jun 2019 10:11:09 +0100 Subject: [PATCH] [Feature] Lua_mimepart: Add function filter_words --- src/lua/lua_common.c | 2 + src/lua/lua_common.h | 5 +- src/lua/lua_mimepart.c | 144 +++++++++++++++++++++++++++++++++++++---- src/lua/lua_regexp.c | 34 +++++----- 4 files changed, 154 insertions(+), 31 deletions(-) diff --git a/src/lua/lua_common.c b/src/lua/lua_common.c index 7e7a2854f..e268d6564 100644 --- a/src/lua/lua_common.c +++ b/src/lua/lua_common.c @@ -2173,6 +2173,8 @@ rspamd_lua_push_words (lua_State *L, GArray *words, /* Push to the resulting vector */ lua_rawseti (L, -2, cnt ++); break; + default: + break; } } diff --git a/src/lua/lua_common.h b/src/lua/lua_common.h index ee6fe9da6..ea07d7717 100644 --- a/src/lua/lua_common.h +++ b/src/lua/lua_common.h @@ -210,6 +210,8 @@ struct rspamd_lua_ip * lua_check_ip (lua_State * L, gint pos); struct rspamd_lua_text * lua_check_text (lua_State * L, gint pos); +struct rspamd_lua_regexp *lua_check_regexp (lua_State * L, gint pos); + enum rspamd_lua_task_header_type { RSPAMD_TASK_HEADER_PUSH_SIMPLE = 0, RSPAMD_TASK_HEADER_PUSH_RAW, @@ -464,7 +466,8 @@ enum rspamd_lua_words_type { RSPAMD_LUA_WORDS_STEM = 0, RSPAMD_LUA_WORDS_NORM, RSPAMD_LUA_WORDS_RAW, - RSPAMD_LUA_WORDS_FULL + RSPAMD_LUA_WORDS_FULL, + RSPAMD_LUA_WORDS_MAX }; /** * Pushes words (rspamd_stat_token_t) to Lua diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c index 4bfa60456..2981939a0 100644 --- a/src/lua/lua_mimepart.c +++ b/src/lua/lua_mimepart.c @@ -146,6 +146,24 @@ LUA_FUNCTION_DEF (textpart, get_words_count); */ LUA_FUNCTION_DEF (textpart, get_words); +/*** + * @method text_part:filter_words(regexp[, how[, max]]) + * Filter words using some regexp: + * - `stem`: stemmed words (default) + * - `norm`: normalised words (utf normalised + lowercased) + * - `raw`: raw words in utf (if possible) + * - `full`: list of tables, each table has the 
following fields: + * - [1] - stemmed word + * - [2] - normalised word + * - [3] - raw word + * - [4] - flags (table of strings) + * @param {rspamd_regexp} regexp regexp to match + * @param {string} how what words to extract + * @param {number} max maximum number of hits returned (all hits if <= 0 or nil) + * @return {table/strings} words matching regexp + */ +LUA_FUNCTION_DEF (textpart, filter_words); + /*** * @method text_part:is_empty() * Returns `true` if the specified part is empty @@ -216,6 +234,7 @@ static const struct luaL_reg textpartlib_m[] = { LUA_INTERFACE_DEF (textpart, get_lines_count), LUA_INTERFACE_DEF (textpart, get_words_count), LUA_INTERFACE_DEF (textpart, get_words), + LUA_INTERFACE_DEF (textpart, filter_words), LUA_INTERFACE_DEF (textpart, is_empty), LUA_INTERFACE_DEF (textpart, is_html), LUA_INTERFACE_DEF (textpart, get_html), @@ -841,6 +860,27 @@ lua_textpart_get_words_count (lua_State *L) return 1; } +static inline enum rspamd_lua_words_type +word_extract_type_from_string (const gchar *how_str) +{ + enum rspamd_lua_words_type how = RSPAMD_LUA_WORDS_MAX; + + if (strcmp (how_str, "stem") == 0) { + how = RSPAMD_LUA_WORDS_STEM; + } + else if (strcmp (how_str, "norm") == 0) { + how = RSPAMD_LUA_WORDS_NORM; + } + else if (strcmp (how_str, "raw") == 0) { + how = RSPAMD_LUA_WORDS_RAW; + } + else if (strcmp (how_str, "full") == 0) { + how = RSPAMD_LUA_WORDS_FULL; + } + + return how; +} + static gint lua_textpart_get_words (lua_State *L) { @@ -859,24 +899,102 @@ lua_textpart_get_words (lua_State *L) if (lua_type (L, 2) == LUA_TSTRING) { const gchar *how_str = lua_tostring (L, 2); - if (strcmp (how_str, "stem") == 0) { - how = RSPAMD_LUA_WORDS_STEM; - } - else if (strcmp (how_str, "norm") == 0) { - how = RSPAMD_LUA_WORDS_NORM; + how = word_extract_type_from_string (how_str); + + if (how == RSPAMD_LUA_WORDS_MAX) { + return luaL_error (L, "invalid extraction type: %s", how_str); } - else if (strcmp (how_str, "raw") == 0) { - how = RSPAMD_LUA_WORDS_RAW; + 
} + + return rspamd_lua_push_words (L, part->utf_words, how); + } + + return 1; +} + +static gint +lua_textpart_filter_words (lua_State *L) +{ + LUA_TRACE_POINT; + struct rspamd_mime_text_part *part = lua_check_textpart (L); + struct rspamd_lua_regexp *re = lua_check_regexp (L, 2); + gint lim = -1; + enum rspamd_lua_words_type how = RSPAMD_LUA_WORDS_STEM; + + if (part == NULL || re == NULL) { + return luaL_error (L, "invalid arguments"); + } + + if (IS_PART_EMPTY (part) || part->utf_words == NULL) { + lua_createtable (L, 0, 0); + } + else { + if (lua_type (L, 3) == LUA_TSTRING) { + const gchar *how_str = lua_tostring (L, 3); /* `how` is argument 3; argument 2 is the regexp userdata */ + + how = word_extract_type_from_string (how_str); + + if (how == RSPAMD_LUA_WORDS_MAX) { + return luaL_error (L, "invalid extraction type: %s", how_str); } - else if (strcmp (how_str, "full") == 0) { - how = RSPAMD_LUA_WORDS_FULL; + } + + if (lua_type (L, 4) == LUA_TNUMBER) { + lim = lua_tointeger (L, 4); + } + + guint cnt, i; + + lua_createtable (L, 8, 0); + + for (i = 0, cnt = 1; i < part->utf_words->len; i ++) { + rspamd_stat_token_t *w = &g_array_index (part->utf_words, + rspamd_stat_token_t, i); + + switch (how) { + case RSPAMD_LUA_WORDS_STEM: + if (w->stemmed.len > 0) { + if (rspamd_regexp_match (re->re, w->stemmed.begin, + w->stemmed.len, FALSE)) { + lua_pushlstring (L, w->stemmed.begin, w->stemmed.len); + lua_rawseti (L, -2, cnt++); + } + } + break; + case RSPAMD_LUA_WORDS_NORM: + if (w->normalized.len > 0) { + if (rspamd_regexp_match (re->re, w->normalized.begin, + w->normalized.len, FALSE)) { + lua_pushlstring (L, w->normalized.begin, w->normalized.len); + lua_rawseti (L, -2, cnt++); + } + } + break; + case RSPAMD_LUA_WORDS_RAW: + if (w->original.len > 0) { + if (rspamd_regexp_match (re->re, w->original.begin, + w->original.len, TRUE)) { + lua_pushlstring (L, w->original.begin, w->original.len); + lua_rawseti (L, -2, cnt++); + } + } + break; + case RSPAMD_LUA_WORDS_FULL: + if (rspamd_regexp_match (re->re, w->normalized.begin, + 
w->normalized.len, FALSE)) { + rspamd_lua_push_full_word (L, w); + /* Push to the resulting vector */ + lua_rawseti (L, -2, cnt++); + } + break; + default: + break; } - else { - return luaL_error (L, "unknown words type: %s", how_str); + + if (lim > 0 && cnt > (guint)lim) { /* cnt is the next free index (hits + 1): stop only once `lim` hits are stored */ + break; } } - - return rspamd_lua_push_words (L, part->utf_words, how); } return 1; diff --git a/src/lua/lua_regexp.c b/src/lua/lua_regexp.c index 4e233448b..4033722a3 100644 --- a/src/lua/lua_regexp.c +++ b/src/lua/lua_regexp.c @@ -72,12 +72,12 @@ static const struct luaL_reg regexplib_f[] = { rspamd_mempool_t *regexp_static_pool = NULL; -static struct rspamd_lua_regexp * -lua_check_regexp (lua_State * L) +struct rspamd_lua_regexp * +lua_check_regexp (lua_State * L, gint pos) { - void *ud = rspamd_lua_check_udata (L, 1, "rspamd{regexp}"); + void *ud = rspamd_lua_check_udata (L, pos, "rspamd{regexp}"); - luaL_argcheck (L, ud != NULL, 1, "'regexp' expected"); + luaL_argcheck (L, ud != NULL, pos, "'regexp' expected"); return ud ? 
*((struct rspamd_lua_regexp **)ud) : NULL; } @@ -401,7 +401,7 @@ static int lua_regexp_get_pattern (lua_State *L) { LUA_TRACE_POINT; - struct rspamd_lua_regexp *re = lua_check_regexp (L); + struct rspamd_lua_regexp *re = lua_check_regexp (L, 1); if (re && re->re && !IS_DESTROYED (re)) { lua_pushstring (L, rspamd_regexp_get_pattern (re->re)); @@ -423,10 +423,10 @@ static int lua_regexp_set_limit (lua_State *L) { LUA_TRACE_POINT; - struct rspamd_lua_regexp *re = lua_check_regexp (L); + struct rspamd_lua_regexp *re = lua_check_regexp (L, 1); gint64 lim; - lim = luaL_checknumber (L, 2); + lim = lua_tointeger (L, 2); if (re && re->re && !IS_DESTROYED (re)) { if (lim > 0) { @@ -450,7 +450,7 @@ static int lua_regexp_set_max_hits (lua_State *L) { LUA_TRACE_POINT; - struct rspamd_lua_regexp *re = lua_check_regexp (L); + struct rspamd_lua_regexp *re = lua_check_regexp (L, 1); guint lim; lim = luaL_checkinteger (L, 2); @@ -474,7 +474,7 @@ static int lua_regexp_get_max_hits (lua_State *L) { LUA_TRACE_POINT; - struct rspamd_lua_regexp *re = lua_check_regexp (L); + struct rspamd_lua_regexp *re = lua_check_regexp (L, 1); if (re && re->re && !IS_DESTROYED (re)) { lua_pushinteger (L, rspamd_regexp_get_maxhits (re->re)); @@ -517,12 +517,12 @@ static int lua_regexp_search (lua_State *L) { LUA_TRACE_POINT; - struct rspamd_lua_regexp *re = lua_check_regexp (L); + struct rspamd_lua_regexp *re = lua_check_regexp (L, 1); const gchar *data = NULL; struct rspamd_lua_text *t; const gchar *start = NULL, *end = NULL; gint i; - gsize len, capn; + gsize len = 0, capn; gboolean matched = FALSE, capture = FALSE, raw = FALSE; GArray *captures = NULL; struct rspamd_re_capture *cap; @@ -611,7 +611,7 @@ static int lua_regexp_match (lua_State *L) { LUA_TRACE_POINT; - struct rspamd_lua_regexp *re = lua_check_regexp (L); + struct rspamd_lua_regexp *re = lua_check_regexp (L, 1); struct rspamd_lua_text *t; const gchar *data = NULL; gsize len = 0; @@ -669,7 +669,7 @@ static int lua_regexp_matchn (lua_State 
*L) { LUA_TRACE_POINT; - struct rspamd_lua_regexp *re = lua_check_regexp (L); + struct rspamd_lua_regexp *re = lua_check_regexp (L, 1); struct rspamd_lua_text *t; const gchar *data = NULL, *start = NULL, *end = NULL; gint max_matches, matches; @@ -688,7 +688,7 @@ lua_regexp_matchn (lua_State *L) } } - max_matches = lua_tonumber (L, 3); + max_matches = lua_tointeger (L, 3); if (lua_gettop (L) == 4) { raw = lua_toboolean (L, 4); @@ -741,7 +741,7 @@ static int lua_regexp_split (lua_State *L) { LUA_TRACE_POINT; - struct rspamd_lua_regexp *re = lua_check_regexp (L); + struct rspamd_lua_regexp *re = lua_check_regexp (L, 1); const gchar *data = NULL; struct rspamd_lua_text *t; gboolean matched = FALSE, is_text = FALSE; @@ -837,7 +837,7 @@ static gint lua_regexp_destroy (lua_State *L) { LUA_TRACE_POINT; - struct rspamd_lua_regexp *to_del = lua_check_regexp (L); + struct rspamd_lua_regexp *to_del = lua_check_regexp (L, 1); if (to_del) { rspamd_regexp_cache_remove (NULL, to_del->re); @@ -853,7 +853,7 @@ static gint lua_regexp_gc (lua_State *L) { LUA_TRACE_POINT; - struct rspamd_lua_regexp *to_del = lua_check_regexp (L); + struct rspamd_lua_regexp *to_del = lua_check_regexp (L, 1); if (to_del) { if (!IS_DESTROYED (to_del)) { -- 2.39.5