From 67a6d822c639ba55546fbcb19db267cee6ac11b2 Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Fri, 30 Nov 2018 13:36:07 +0000
Subject: [PATCH] [Feature] Allow to get all types of words from Lua

---
 src/lua/lua_common.c   | 118 ++++++++++++++++++++++++++++++++++++++++-
 src/lua/lua_common.h   |  17 ++++++
 src/lua/lua_mimepart.c |  40 ++++++++++----
 3 files changed, 164 insertions(+), 11 deletions(-)

diff --git a/src/lua/lua_common.c b/src/lua/lua_common.c
index dcab21530..fe463b763 100644
--- a/src/lua/lua_common.c
+++ b/src/lua/lua_common.c
@@ -21,9 +21,12 @@
 #include "ottery.h"
 #include "rspamd_control.h"
 #include "lua_thread_pool.h"
+#include "libstat/stat_api.h"
+#include "libserver/rspamd_control.h"
+
 #include <math.h>
 #include <sys/wait.h>
-#include <src/libserver/rspamd_control.h>
+
 
 /* Lua module init function */
 #define MODULE_INIT_FUNC "module_init"
@@ -2403,4 +2406,117 @@ rspamd_lua_try_load_redis (lua_State *L, const ucl_object_t *obj,
 	}
 
 	return FALSE;
+}
+
+gint
+rspamd_lua_push_words (lua_State *L, GArray *words,
+		enum rspamd_lua_words_type how)
+{
+	rspamd_stat_token_t *w;
+	guint i, cnt, fl_cnt;
+
+	lua_createtable (L, words->len, 0);
+
+	for (i = 0, cnt = 1; i < words->len; i ++) {
+		w = &g_array_index (words, rspamd_stat_token_t, i);
+
+		switch (how) {
+		case RSPAMD_LUA_WORDS_STEM:
+			if (w->stemmed.len > 0) {
+				lua_pushlstring (L, w->stemmed.begin, w->stemmed.len);
+				lua_rawseti (L, -2, cnt ++);
+			}
+			break;
+		case RSPAMD_LUA_WORDS_NORM:
+			if (w->normalized.len > 0) {
+				lua_pushlstring (L, w->normalized.begin, w->normalized.len);
+				lua_rawseti (L, -2, cnt ++);
+			}
+			break;
+		case RSPAMD_LUA_WORDS_RAW:
+			if (w->original.len > 0) {
+				lua_pushlstring (L, w->original.begin, w->original.len);
+				lua_rawseti (L, -2, cnt ++);
+			}
+			break;
+		case RSPAMD_LUA_WORDS_FULL:
+			lua_createtable (L, 4, 0);
+
+			if (w->stemmed.len > 0) {
+				lua_pushlstring (L, w->stemmed.begin, w->stemmed.len);
+				lua_rawseti (L, -2, 1);
+			}
+			else {
+				lua_pushstring (L, "");
+				lua_rawseti (L, -2, 1);
+			}
+
+			if (w->normalized.len > 0) {
+				lua_pushlstring (L, w->normalized.begin, w->normalized.len);
+				lua_rawseti (L, -2, 2);
+			}
+			else {
+				lua_pushstring (L, "");
+				lua_rawseti (L, -2, 2);
+			}
+
+			if (w->original.len > 0) {
+				lua_pushlstring (L, w->original.begin, w->original.len);
+				lua_rawseti (L, -2, 3);
+			}
+			else {
+				lua_pushstring (L, "");
+				lua_rawseti (L, -2, 3);
+			}
+
+			/* Flags part */
+			fl_cnt = 1;
+			lua_createtable (L, 4, 0);
+
+			if (w->flags & RSPAMD_STAT_TOKEN_FLAG_NORMALISED) {
+				lua_pushstring (L, "normalised");
+				lua_rawseti (L, -2, fl_cnt ++);
+			}
+			if (w->flags & RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE) {
+				lua_pushstring (L, "broken_unicode");
+				lua_rawseti (L, -2, fl_cnt ++);
+			}
+			if (w->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
+				lua_pushstring (L, "utf");
+				lua_rawseti (L, -2, fl_cnt ++);
+			}
+			if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+				lua_pushstring (L, "text");
+				lua_rawseti (L, -2, fl_cnt ++);
+			}
+			if (w->flags & RSPAMD_STAT_TOKEN_FLAG_HEADER) {
+				lua_pushstring (L, "header");
+				lua_rawseti (L, -2, fl_cnt ++);
+			}
+			if (w->flags & (RSPAMD_STAT_TOKEN_FLAG_META|RSPAMD_STAT_TOKEN_FLAG_LUA_META)) {
+				lua_pushstring (L, "meta");
+				lua_rawseti (L, -2, fl_cnt ++);
+			}
+			if (w->flags & RSPAMD_STAT_TOKEN_FLAG_STOP_WORD) {
+				lua_pushstring (L, "stop_word");
+				lua_rawseti (L, -2, fl_cnt ++);
+			}
+			if (w->flags & RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES) {
+				lua_pushstring (L, "invisible_spaces");
+				lua_rawseti (L, -2, fl_cnt ++);
+			}
+			if (w->flags & RSPAMD_STAT_TOKEN_FLAG_STEMMED) {
+				lua_pushstring (L, "stemmed");
+				lua_rawseti (L, -2, fl_cnt ++);
+			}
+
+			lua_rawseti (L, -2, 4);
+
+			/* Push to the resulting vector */
+			lua_rawseti (L, -2, cnt ++);
+			break;
+		}
+	}
+
+	return 1;
 }
\ No newline at end of file
diff --git a/src/lua/lua_common.h b/src/lua/lua_common.h
index 2dee888ee..25f5b7ff4 100644
--- a/src/lua/lua_common.h
+++ b/src/lua/lua_common.h
@@ -433,6 +433,23 @@ gboolean rspamd_lua_require_function (lua_State *L, const gchar *modname,
 gboolean rspamd_lua_try_load_redis (lua_State *L, const ucl_object_t *obj,
 		struct rspamd_config *cfg, gint *ref_id);
 
+
+enum rspamd_lua_words_type {
+	RSPAMD_LUA_WORDS_STEM = 0,
+	RSPAMD_LUA_WORDS_NORM,
+	RSPAMD_LUA_WORDS_RAW,
+	RSPAMD_LUA_WORDS_FULL
+};
+/**
+ * Pushes words (rspamd_stat_token_t) to Lua
+ * @param L
+ * @param words
+ * @param how
+ * @return number of values left on the Lua stack (always 1: the table of words)
+ */
+gint rspamd_lua_push_words (lua_State *L, GArray *words,
+		enum rspamd_lua_words_type how);
+
 /* Paths defs */
 #define RSPAMD_CONFDIR_INDEX "CONFDIR"
 #define RSPAMD_LOCAL_CONFDIR_INDEX "LOCAL_CONFDIR"
diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c
index d2ff7e8e4..14111f760 100644
--- a/src/lua/lua_mimepart.c
+++ b/src/lua/lua_mimepart.c
@@ -132,8 +132,16 @@ LUA_FUNCTION_DEF (textpart, get_stats);
 LUA_FUNCTION_DEF (textpart, get_words_count);
 
 /***
- * @method mime_part:get_words()
- * Get words in the part
+ * @method mime_part:get_words([how])
+ * Get words in the part. Optional `how` argument defines type of words returned:
+ * - `stem`: stemmed words (default)
+ * - `norm`: normalised words (utf normalised + lowercased)
+ * - `raw`: raw words in utf (if possible)
+ * - `full`: list of tables, each table has the following fields:
+ *   - [1] - stemmed word
+ *   - [2] - normalised word
+ *   - [3] - raw word
+ *   - [4] - flags (table of strings)
  * @return {table/strings} words in the part
  */
 LUA_FUNCTION_DEF (textpart, get_words);
@@ -759,8 +767,7 @@ lua_textpart_get_words (lua_State *L)
 {
 	LUA_TRACE_POINT;
 	struct rspamd_mime_text_part *part = lua_check_textpart (L);
-	rspamd_stat_token_t *w;
-	guint i;
+	enum rspamd_lua_words_type how = RSPAMD_LUA_WORDS_STEM;
 
 	if (part == NULL) {
 		return luaL_error (L, "invalid arguments");
@@ -770,14 +777,27 @@ lua_textpart_get_words (lua_State *L)
 		lua_createtable (L, 0, 0);
 	}
 	else {
-		lua_createtable (L, part->utf_words->len, 0);
-
-		for (i = 0; i < part->utf_words->len; i ++) {
-			w = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
+		if (lua_type (L, 2) == LUA_TSTRING) {
+			const gchar *how_str = lua_tostring (L, 2);
 
-			lua_pushlstring (L, w->stemmed.begin, w->stemmed.len);
-			lua_rawseti (L, -2, i + 1);
+			if (strcmp (how_str, "stem") == 0) {
+				how = RSPAMD_LUA_WORDS_STEM;
+			}
+			else if (strcmp (how_str, "norm") == 0) {
+				how = RSPAMD_LUA_WORDS_NORM;
+			}
+			else if (strcmp (how_str, "raw") == 0) {
+				how = RSPAMD_LUA_WORDS_RAW;
+			}
+			else if (strcmp (how_str, "full") == 0) {
+				how = RSPAMD_LUA_WORDS_FULL;
+			}
+			else {
+				return luaL_error (L, "unknown words type: %s", how_str);
+			}
 		}
+
+		return rspamd_lua_push_words (L, part->utf_words, how);
 	}
 
 	return 1;
-- 
2.39.5