[Feature] Allow getting all types of words from Lua

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Fri, 30 Nov 2018 13:36:07 +0000 (13:36 +0000)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Fri, 30 Nov 2018 13:36:07 +0000 (13:36 +0000)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 30 Nov 2018 13:36:07 +0000 (13:36 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 30 Nov 2018 13:36:07 +0000 (13:36 +0000)
diff --git a/src/lua/lua_common.c b/src/lua/lua_common.c

index dcab21530bdf2d68bf7a412477a5c0ab93f5b35f..fe463b76344a1ccad242fbea77b936d8fd21e941 100644 (file)
--- a/src/lua/lua_common.c
+++ b/src/lua/lua_common.c
@@ -21,9 +21,12 @@
  #include "ottery.h"
  #include "rspamd_control.h"
  #include "lua_thread_pool.h"
+#include "libstat/stat_api.h"
+#include "libserver/rspamd_control.h"
+
  #include <math.h>
  #include <sys/wait.h>
-#include <src/libserver/rspamd_control.h>
+
  
  /* Lua module init function */
  #define MODULE_INIT_FUNC "module_init"
@@ -2403,4 +2406,117 @@ rspamd_lua_try_load_redis (lua_State *L, const ucl_object_t *obj,
         }
  
         return FALSE;
+}
+
+/* Pushes a Lua array of words: plain strings for STEM/NORM/RAW (empty ones are
+ * skipped) or {stemmed, normalised, raw, {flag names}} tuples for FULL.
+ * Returns 1, since the array is the single value left on the Lua stack. */
+gint
+rspamd_lua_push_words (lua_State *L, GArray *words,
+                                                       enum rspamd_lua_words_type how)
+{
+       rspamd_stat_token_t *w;
+       guint i, cnt, fl_cnt;
+
+       lua_createtable (L, words->len, 0);
+
+       for (i = 0, cnt = 1; i < words->len; i ++) {
+               w = &g_array_index (words, rspamd_stat_token_t, i);
+
+               switch (how) {
+               case RSPAMD_LUA_WORDS_STEM:
+                       if (w->stemmed.len > 0) {
+                               lua_pushlstring (L, w->stemmed.begin, w->stemmed.len);
+                               lua_rawseti (L, -2, cnt ++);
+                       }
+                       break;
+               case RSPAMD_LUA_WORDS_NORM:
+                       if (w->normalized.len > 0) {
+                               lua_pushlstring (L, w->normalized.begin, w->normalized.len);
+                               lua_rawseti (L, -2, cnt ++);
+                       }
+                       break;
+               case RSPAMD_LUA_WORDS_RAW:
+                       if (w->original.len > 0) {
+                               lua_pushlstring (L, w->original.begin, w->original.len);
+                               lua_rawseti (L, -2, cnt ++);
+                       }
+                       break;
+               case RSPAMD_LUA_WORDS_FULL:
+                       /* Empty forms are stored as "" so that indexes 1..4 stay fixed */
+                       lua_createtable (L, 4, 0);
+                       if (w->stemmed.len > 0) {
+                               lua_pushlstring (L, w->stemmed.begin, w->stemmed.len);
+                       }
+                       else {
+                               lua_pushstring (L, "");
+                       }
+                       lua_rawseti (L, -2, 1);
+
+                       if (w->normalized.len > 0) {
+                               lua_pushlstring (L, w->normalized.begin, w->normalized.len);
+                       }
+                       else {
+                               lua_pushstring (L, "");
+                       }
+                       lua_rawseti (L, -2, 2);
+
+                       if (w->original.len > 0) {
+                               lua_pushlstring (L, w->original.begin, w->original.len);
+                       }
+                       else {
+                               lua_pushstring (L, "");
+                       }
+                       lua_rawseti (L, -2, 3);
+
+                       /* Flags part */
+                       fl_cnt = 1;
+                       lua_createtable (L, 4, 0);
+
+                       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_NORMALISED) {
+                               lua_pushstring (L, "normalised");
+                               lua_rawseti (L, -2, fl_cnt ++);
+                       }
+                       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE) {
+                               lua_pushstring (L, "broken_unicode");
+                               lua_rawseti (L, -2, fl_cnt ++);
+                       }
+                       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
+                               lua_pushstring (L, "utf");
+                               lua_rawseti (L, -2, fl_cnt ++);
+                       }
+                       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+                               lua_pushstring (L, "text");
+                               lua_rawseti (L, -2, fl_cnt ++);
+                       }
+                       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_HEADER) {
+                               lua_pushstring (L, "header");
+                               lua_rawseti (L, -2, fl_cnt ++);
+                       }
+                       if (w->flags & (RSPAMD_STAT_TOKEN_FLAG_META|RSPAMD_STAT_TOKEN_FLAG_LUA_META)) {
+                               lua_pushstring (L, "meta");
+                               lua_rawseti (L, -2, fl_cnt ++);
+                       }
+                       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_STOP_WORD) {
+                               lua_pushstring (L, "stop_word");
+                               lua_rawseti (L, -2, fl_cnt ++);
+                       }
+                       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES) {
+                               lua_pushstring (L, "invisible_spaces");
+                               lua_rawseti (L, -2, fl_cnt ++);
+                       }
+                       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_STEMMED) {
+                               lua_pushstring (L, "stemmed");
+                               lua_rawseti (L, -2, fl_cnt ++);
+                       }
+                       /* Store the flags table as the 4th element of the tuple */
+                       lua_rawseti (L, -2, 4);
+                       /* Push to the resulting vector */
+                       lua_rawseti (L, -2, cnt ++);
+                       break;
+               }
+       }
+
+       /* Outside the loop: every word is processed and an empty array still returns */
+       return 1;
  }
 \ No newline at end of file
diff --git a/src/lua/lua_common.h b/src/lua/lua_common.h

index 2dee888eeb11d6ab2bf7b5896220b299706bf416..25f5b7ff4892368b89896161ca74c5a924dfbbd9 100644 (file)
--- a/src/lua/lua_common.h
+++ b/src/lua/lua_common.h
@@ -433,6 +433,22 @@ gboolean rspamd_lua_require_function (lua_State *L, const gchar *modname,
  gboolean rspamd_lua_try_load_redis (lua_State *L, const ucl_object_t *obj,
                 struct rspamd_config *cfg, gint *ref_id);
  
+
+enum rspamd_lua_words_type {
+       RSPAMD_LUA_WORDS_STEM = 0, /* stemmed form of each word */
+       RSPAMD_LUA_WORDS_NORM, /* normalised form of each word */
+       RSPAMD_LUA_WORDS_RAW, /* original (raw) form of each word */
+       RSPAMD_LUA_WORDS_FULL /* table per word: stemmed, normalised, raw, flags */
+};
+/**
+ * Pushes words (rspamd_stat_token_t) to Lua as one table; returns Lua value count
+ * @param L Lua state that receives the table
+ * @param words array of rspamd_stat_token_t elements
+ * @param how selects which form of each word is pushed (enum rspamd_lua_words_type)
+ */
+gint rspamd_lua_push_words (lua_State *L, GArray *words,
+               enum rspamd_lua_words_type how);
+
  /* Paths defs */
  #define RSPAMD_CONFDIR_INDEX "CONFDIR"
  #define RSPAMD_LOCAL_CONFDIR_INDEX "LOCAL_CONFDIR"
diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c

index d2ff7e8e449d4e3e44d57aec9062aabeea29295c..14111f760f6ba6832212259898d11dd005585655 100644 (file)
--- a/src/lua/lua_mimepart.c
+++ b/src/lua/lua_mimepart.c
@@ -132,8 +132,16 @@ LUA_FUNCTION_DEF (textpart, get_stats);
  LUA_FUNCTION_DEF (textpart, get_words_count);
  
  /***
- * @method mime_part:get_words()
- * Get words in the part
+ * @method mime_part:get_words([how])
+ * Get words in the part. Optional `how` argument defines type of words returned:
+ * - `stem`: stemmed words (default)
+ * - `norm`: normalised words (utf normalised + lowercased)
+ * - `raw`: raw words in utf (if possible)
+ * - `full`: list of tables, each table has the following fields:
+ *   - [1] - stemmed word
+ *   - [2] - normalised word
+ *   - [3] - raw word
+ *   - [4] - flags (table of strings)
   * @return {table/strings} words in the part
   */
  LUA_FUNCTION_DEF (textpart, get_words);
@@ -759,8 +767,7 @@ lua_textpart_get_words (lua_State *L)
  {
         LUA_TRACE_POINT;
         struct rspamd_mime_text_part *part = lua_check_textpart (L);
-       rspamd_stat_token_t *w;
-       guint i;
+       enum rspamd_lua_words_type how = RSPAMD_LUA_WORDS_STEM;
  
         if (part == NULL) {
                 return luaL_error (L, "invalid arguments");
@@ -770,14 +777,27 @@ lua_textpart_get_words (lua_State *L)
                 lua_createtable (L, 0, 0);
         }
         else {
-               lua_createtable (L, part->utf_words->len, 0);
-
-               for (i = 0; i < part->utf_words->len; i ++) {
-                       w = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
+               if (lua_type (L, 2) == LUA_TSTRING) {
+                       const gchar *how_str = lua_tostring (L, 2);
  
-                       lua_pushlstring (L, w->stemmed.begin, w->stemmed.len);
-                       lua_rawseti (L, -2, i + 1);
+                       if (strcmp (how_str, "stem") == 0) {
+                               how = RSPAMD_LUA_WORDS_STEM;
+                       }
+                       else if (strcmp (how_str, "norm") == 0) {
+                               how = RSPAMD_LUA_WORDS_NORM;
+                       }
+                       else if (strcmp (how_str, "raw") == 0) {
+                               how = RSPAMD_LUA_WORDS_RAW;
+                       }
+                       else if (strcmp (how_str, "full") == 0) {
+                               how = RSPAMD_LUA_WORDS_FULL;
+                       }
+                       else {
+                               return luaL_error (L, "unknown words type: %s", how_str);
+                       }
                 }
+
+               return rspamd_lua_push_words (L, part->utf_words, how);
         }
  
         return 1;
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Fri, 30 Nov 2018 13:36:07 +0000 (13:36 +0000)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Fri, 30 Nov 2018 13:36:07 +0000 (13:36 +0000)
src/lua/lua_common.c		patch \| blob \| history
src/lua/lua_common.h		patch \| blob \| history
src/lua/lua_mimepart.c		patch \| blob \| history