source.dussan.org Git - rspamd.git/commitdiff
[Project] Use more generalised API to produce meta words
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 26 Nov 2018 16:40:51 +0000 (16:40 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 26 Nov 2018 16:40:51 +0000 (16:40 +0000)
src/libmime/message.c
src/libserver/task.c
src/libserver/task.h
src/libstat/stat_api.h
src/libstat/stat_process.c
src/libstat/tokenizers/tokenizers.c
src/libstat/tokenizers/tokenizers.h
src/lua/lua_task.c
src/lua/lua_util.c
src/plugins/chartable.c

index 994720f76a0af0400b086cba7ee4cfe0cc7a1415..46f528ba7ac94d484288c70e5d7b0456b45e665e 100644 (file)
@@ -173,7 +173,7 @@ rspamd_mime_part_create_words (struct rspamd_task *task,
                        &part->utf_stripped_text,
                        tok_type, task->cfg,
                        part->exceptions,
-                       NULL);
+                       NULL, NULL);
 
 
        if (part->utf_words) {
@@ -1278,6 +1278,8 @@ rspamd_message_process (struct rspamd_task *task)
                        *var /= (double)total_words;
                }
        }
+
+       rspamd_tokenize_meta_words (task);
 }
 
 
index 6135bced4265bc168218d72c9dcb18fe709d2664..664715feac80f12cdf380d145a6d377640d88925 100644 (file)
@@ -282,6 +282,10 @@ rspamd_task_free (struct rspamd_task *task)
                        rspamd_email_address_free (task->from_envelope);
                }
 
+               if (task->meta_words) {
+                       g_array_free (task->meta_words, TRUE);
+               }
+
                ucl_object_unref (task->messages);
 
                if (task->re_rt) {
index 005f6af26787cb475ab4f9852d081234be02e8fd..b41b308e4d751545ade99cd8fabc057ee08b0a53 100644 (file)
@@ -173,6 +173,8 @@ struct rspamd_task {
        struct rspamd_metric_result *result;                    /**< Metric result                                                                      */
        GHashTable *lua_cache;                                                  /**< cache of lua objects                                                       */
        GPtrArray *tokens;                                                              /**< statistics tokens */
+       GArray *meta_words;                                                             /**< rspamd_stat_token_t produced from meta headers
+                                                                                                               (e.g. Subject) */
 
        GPtrArray *rcpt_mime;
        GPtrArray *rcpt_envelope;                                               /**< array of rspamd_email_address                                      */
index b912f8d203dc424b57ab5cc690c642d0d14f90b9..ee8db8af2e452ee6993aa758879abbbd1e3ee5e4 100644 (file)
@@ -30,7 +30,7 @@
 #define RSPAMD_STAT_TOKEN_FLAG_META (1u << 1)
 #define RSPAMD_STAT_TOKEN_FLAG_LUA_META (1u << 2)
 #define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION (1u << 3)
-#define RSPAMD_STAT_TOKEN_FLAG_SUBJECT (1u << 4)
+#define RSPAMD_STAT_TOKEN_FLAG_HEADER (1u << 4)
 #define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM (1u << 5)
 #define RSPAMD_STAT_TOKEN_FLAG_UTF (1u << 6)
 #define RSPAMD_STAT_TOKEN_FLAG_NORMALISED (1u << 7)
index ed3f78fdeedf29aaec302c812b680aed43e0eb92..d601dbee93d364f9f71f9d351128a12f71e0132a 100644 (file)
@@ -126,7 +126,6 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
        struct rspamd_mime_text_part *part;
        rspamd_cryptobox_hash_state_t hst;
        rspamd_token_t *st_tok;
-       GArray *words;
        guint i, reserved_len = 0;
        gdouble *pdiff;
        guchar hout[rspamd_cryptobox_HASHBYTES];
@@ -170,19 +169,13 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
                }
        }
 
-       if (task->subject != NULL) {
-               words = rspamd_tokenize_subject (task);
-               if (words != NULL) {
-                       st_ctx->tokenizer->tokenize_func (st_ctx,
-                                       task,
-                                       words,
-                                       TRUE,
-                                       "SUBJECT",
-                                       task->tokens);
-
-                       rspamd_mempool_add_destructor (task->task_pool,
-                                       rspamd_array_free_hard, words);
-               }
+       if (task->meta_words != NULL) {
+               st_ctx->tokenizer->tokenize_func (st_ctx,
+                               task,
+                               task->meta_words,
+                               TRUE,
+                               "SUBJECT",
+                               task->tokens);
        }
 
        rspamd_stat_tokenize_parts_metadata (st_ctx, task);
index 604fc070e15ef669657db478ce657c409382564f..dcd5569109c4997a320ae17872fd714b9ce6c149 100644 (file)
@@ -251,7 +251,8 @@ rspamd_tokenize_text (const gchar *text, gsize len,
                                          enum rspamd_tokenize_type how,
                                          struct rspamd_config *cfg,
                                          GList *exceptions,
-                                         guint64 *hash)
+                                         guint64 *hash,
+                                         GArray *cur_words)
 {
        rspamd_stat_token_t token, buf;
        const gchar *pos = NULL;
@@ -265,7 +266,7 @@ rspamd_tokenize_text (const gchar *text, gsize len,
        static UBreakIterator* bi = NULL;
 
        if (text == NULL) {
-               return NULL;
+               return cur_words;
        }
 
        buf.original.begin = text;
@@ -281,8 +282,13 @@ rspamd_tokenize_text (const gchar *text, gsize len,
                initial_size = word_decay * 2;
        }
 
-       res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t),
-                       initial_size);
+       if (!cur_words) {
+               res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t),
+                               initial_size);
+       }
+       else {
+               res = cur_words;
+       }
 
        if (G_UNLIKELY (how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) {
                while (rspamd_tokenizer_get_word_raw (&buf, &pos, &token, &cur, &l, FALSE)) {
@@ -474,71 +480,96 @@ start_over:
 
 #undef SHIFT_EX
 
-GArray *
-rspamd_tokenize_subject (struct rspamd_task *task)
+static void
+rspamd_add_metawords_from_str (const gchar *beg, gsize len,
+                                                               struct rspamd_task *task)
 {
        UText utxt = UTEXT_INITIALIZER;
        UErrorCode uc_err = U_ZERO_ERROR;
-       gsize slen;
-       gboolean valid_utf = TRUE;
-       GArray *words = NULL;
        guint i = 0;
-       gint32 uc;
-       rspamd_stat_token_t *tok;
+       UChar32 uc;
+       gboolean valid_utf = TRUE;
 
-       if (task->subject) {
-               const gchar *p = task->subject;
+       while (i < len) {
+               U8_NEXT (beg, i, len, uc);
 
-               slen = strlen (task->subject);
+               if (((gint32) uc) < 0) {
+                       valid_utf = FALSE;
+                       break;
+               }
 
-               while (i < slen) {
-                       U8_NEXT (p, i, slen, uc);
+#if U_ICU_VERSION_MAJOR_NUM < 50
+               if (u_isalpha (uc)) {
+                       gint32 sc = ublock_getCode (uc);
 
-                       if (((gint32) uc) < 0) {
+                       if (sc == UBLOCK_THAI) {
                                valid_utf = FALSE;
+                               msg_info_task ("enable workaround for Thai characters for old libicu");
                                break;
                        }
-#if U_ICU_VERSION_MAJOR_NUM < 50
-                       if (u_isalpha (uc)) {
-                               gint32 sc = ublock_getCode (uc);
-
-                               if (sc == UBLOCK_THAI) {
-                                       valid_utf = FALSE;
-                                       msg_info_task ("enable workaround for Thai characters for old libicu");
-                                       break;
-                               }
-                       }
-#endif
                }
+#endif
+       }
 
-               if (valid_utf) {
-                       utext_openUTF8 (&utxt,
-                                       task->subject,
-                                       slen,
-                                       &uc_err);
+       if (valid_utf) {
+               utext_openUTF8 (&utxt,
+                               beg,
+                               len,
+                               &uc_err);
 
-                       words = rspamd_tokenize_text (task->subject, slen,
-                                       &utxt, RSPAMD_TOKENIZE_UTF,
-                                       task->cfg, NULL, NULL);
+               task->meta_words = rspamd_tokenize_text (beg, len,
+                               &utxt, RSPAMD_TOKENIZE_UTF,
+                               task->cfg, NULL, NULL, task->meta_words);
 
-                       utext_close (&utxt);
-               }
-               else {
-                       words = rspamd_tokenize_text (task->subject, slen,
-                                       NULL, RSPAMD_TOKENIZE_RAW,
-                                       task->cfg, NULL, NULL);
-               }
+               utext_close (&utxt);
        }
+       else {
+               task->meta_words = rspamd_tokenize_text (beg, len,
+                               NULL, RSPAMD_TOKENIZE_RAW,
+                               task->cfg, NULL, NULL, task->meta_words);
+       }
+}
+
+void
+rspamd_tokenize_meta_words (struct rspamd_task *task)
+{
+       guint i = 0;
+       rspamd_stat_token_t *tok;
+
+       if (task->subject) {
+               rspamd_add_metawords_from_str (task->subject, strlen (task->subject), task);
+       }
+
+       if (task->from_mime) {
+               struct rspamd_email_address *addr;
 
-       if (words != NULL) {
+               addr = g_ptr_array_index (task->from_mime, 0);
 
-               for (i = 0; i < words->len; i++) {
-                       tok = &g_array_index (words, rspamd_stat_token_t, i);
-                       tok->flags |= RSPAMD_STAT_TOKEN_FLAG_SUBJECT;
+               if (addr->name) {
+                       rspamd_add_metawords_from_str (addr->name, strlen (addr->name), task);
                }
        }
 
-       return words;
+       if (task->meta_words != NULL) {
+               const gchar *language = NULL;
+
+               if (task->text_parts && task->text_parts->len > 0) {
+                       struct rspamd_mime_text_part *tp = g_ptr_array_index (task->text_parts, 0);
+
+                       if (tp->language) {
+                               language = tp->language;
+                       }
+               }
+
+               rspamd_normalize_words (task->meta_words, task->task_pool);
+               rspamd_stem_words (task->meta_words, task->task_pool, language,
+                               task->lang_det);
+
+               for (i = 0; i < task->meta_words->len; i++) {
+                       tok = &g_array_index (task->meta_words, rspamd_stat_token_t, i);
+                       tok->flags |= RSPAMD_STAT_TOKEN_FLAG_HEADER;
+               }
+       }
 }
 
 static inline void
index 683d728ed0f3c79a48fd76f1bcbf1990d207fd56..784426d311d7322be195dd8d69edebd68900e5d7 100644 (file)
@@ -43,7 +43,8 @@ GArray * rspamd_tokenize_text (const gchar *text, gsize len,
                                                           enum rspamd_tokenize_type how,
                                                           struct rspamd_config *cfg,
                                                           GList *exceptions,
-                                                          guint64 *hash);
+                                                          guint64 *hash,
+                                                          GArray *cur_words);
 
 /* OSB tokenize function */
 gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
@@ -64,7 +65,7 @@ void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
                                                const gchar *language,
                                                struct rspamd_lang_detector *d);
 
-GArray * rspamd_tokenize_subject (struct rspamd_task *task);
+void rspamd_tokenize_meta_words (struct rspamd_task *task);
 #endif
 /*
  * vi:ts=4
index 4f28e94922ccab268eace018c64b930c67982f98..1b5f33cb7d6dcb19bd7283ec9e60cc7ddec8d6ba 100644 (file)
@@ -4796,8 +4796,8 @@ lua_push_stat_token (lua_State *L, rspamd_token_t *tok)
                        lua_pushboolean (L, true);
                        lua_settable (L, -3);
                }
-               if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_SUBJECT) {
-                       lua_pushstring (L, "subject");
+               if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_HEADER) {
+                       lua_pushstring (L, "header");
                        lua_pushboolean (L, true);
                        lua_settable (L, -3);
                }
index ec22b8a9a6bb59397fe1d3f495ddcf3517afbf11..a064bce5b10b6bc2612c409c813bf26deaa33e27 100644 (file)
@@ -1144,7 +1144,7 @@ lua_util_tokenize_text (lua_State *L)
                        &utxt,
                        RSPAMD_TOKENIZE_UTF, NULL,
                        exceptions,
-                       NULL);
+                       NULL, NULL);
 
        if (res == NULL) {
                lua_pushnil (L);
index c566cc517b32f49ba67aa56bc87f116187d263a3..e48d19ea2dab49a04f5f4bb88e937adec87f2ee2 100644 (file)
@@ -621,35 +621,27 @@ chartable_symbol_callback (struct rspamd_task *task,
                rspamd_chartable_process_part (task, part, chartable_module_ctx);
        }
 
-       if (task->subject != NULL) {
-               GArray *words;
+       if (task->meta_words != NULL) {
                rspamd_stat_token_t *w;
-               gdouble cur_score = 0.0;
+               gdouble cur_score = 0;
+               gsize arlen = task->meta_words->len;
 
-               words = rspamd_tokenize_subject (task);
-
-               if (words && words->len > 0) {
-                       for (i = 0; i < words->len; i++) {
-                               w = &g_array_index (words, rspamd_stat_token_t, i);
-                               cur_score += rspamd_chartable_process_word_utf (task, w, FALSE,
-                                               NULL, chartable_module_ctx);
-                       }
-
-                       cur_score /= (gdouble)words->len;
-
-                       if (cur_score > 2.0) {
-                               cur_score = 2.0;
-                       }
+               for (i = 0; i < arlen; i++) {
+                       w = &g_array_index (task->meta_words, rspamd_stat_token_t, i);
+                       cur_score += rspamd_chartable_process_word_utf (task, w, FALSE,
+                                       NULL, chartable_module_ctx);
+               }
 
-                       if (cur_score > chartable_module_ctx->threshold) {
-                               rspamd_task_insert_result (task, chartable_module_ctx->symbol,
-                                               cur_score, "subject");
+               cur_score /= (gdouble)arlen;
 
-                       }
+               if (cur_score > 2.0) {
+                       cur_score = 2.0;
                }
 
-               if (words) {
-                       g_array_free (words, TRUE);
+               if (cur_score > chartable_module_ctx->threshold) {
+                       rspamd_task_insert_result (task, chartable_module_ctx->symbol,
+                                       cur_score, "subject");
+
                }
        }