From 2dcb09b6eae0b21d4da68f9323be4cbfeb2b4237 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 26 Nov 2018 16:40:51 +0000 Subject: [PATCH] [Project] Use more generalised API to produce meta words --- src/libmime/message.c | 4 +- src/libserver/task.c | 4 + src/libserver/task.h | 2 + src/libstat/stat_api.h | 2 +- src/libstat/stat_process.c | 21 ++--- src/libstat/tokenizers/tokenizers.c | 127 +++++++++++++++++----------- src/libstat/tokenizers/tokenizers.h | 5 +- src/lua/lua_task.c | 4 +- src/lua/lua_util.c | 2 +- src/plugins/chartable.c | 38 ++++----- 10 files changed, 117 insertions(+), 92 deletions(-) diff --git a/src/libmime/message.c b/src/libmime/message.c index 994720f76..46f528ba7 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -173,7 +173,7 @@ rspamd_mime_part_create_words (struct rspamd_task *task, &part->utf_stripped_text, tok_type, task->cfg, part->exceptions, - NULL); + NULL, NULL); if (part->utf_words) { @@ -1278,6 +1278,8 @@ rspamd_message_process (struct rspamd_task *task) *var /= (double)total_words; } } + + rspamd_tokenize_meta_words (task); } diff --git a/src/libserver/task.c b/src/libserver/task.c index 6135bced4..664715fea 100644 --- a/src/libserver/task.c +++ b/src/libserver/task.c @@ -282,6 +282,10 @@ rspamd_task_free (struct rspamd_task *task) rspamd_email_address_free (task->from_envelope); } + if (task->meta_words) { + g_array_free (task->meta_words, TRUE); + } + ucl_object_unref (task->messages); if (task->re_rt) { diff --git a/src/libserver/task.h b/src/libserver/task.h index 005f6af26..b41b308e4 100644 --- a/src/libserver/task.h +++ b/src/libserver/task.h @@ -173,6 +173,8 @@ struct rspamd_task { struct rspamd_metric_result *result; /**< Metric result */ GHashTable *lua_cache; /**< cache of lua objects */ GPtrArray *tokens; /**< statistics tokens */ + GArray *meta_words; /**< rspamd_stat_token_t produced from meta headers + (e.g. Subject) */ GPtrArray *rcpt_mime; GPtrArray *rcpt_envelope; /**< array of rspamd_email_address */ diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h index b912f8d20..ee8db8af2 100644 --- a/src/libstat/stat_api.h +++ b/src/libstat/stat_api.h @@ -30,7 +30,7 @@ #define RSPAMD_STAT_TOKEN_FLAG_META (1u << 1) #define RSPAMD_STAT_TOKEN_FLAG_LUA_META (1u << 2) #define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION (1u << 3) -#define RSPAMD_STAT_TOKEN_FLAG_SUBJECT (1u << 4) +#define RSPAMD_STAT_TOKEN_FLAG_HEADER (1u << 4) #define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM (1u << 5) #define RSPAMD_STAT_TOKEN_FLAG_UTF (1u << 6) #define RSPAMD_STAT_TOKEN_FLAG_NORMALISED (1u << 7) diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index ed3f78fde..d601dbee9 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -126,7 +126,6 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, struct rspamd_mime_text_part *part; rspamd_cryptobox_hash_state_t hst; rspamd_token_t *st_tok; - GArray *words; guint i, reserved_len = 0; gdouble *pdiff; guchar hout[rspamd_cryptobox_HASHBYTES]; @@ -170,19 +169,13 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, } } - if (task->subject != NULL) { - words = rspamd_tokenize_subject (task); - if (words != NULL) { - st_ctx->tokenizer->tokenize_func (st_ctx, - task, - words, - TRUE, - "SUBJECT", - task->tokens); - - rspamd_mempool_add_destructor (task->task_pool, - rspamd_array_free_hard, words); - } + if (task->meta_words != NULL) { + st_ctx->tokenizer->tokenize_func (st_ctx, + task, + task->meta_words, + TRUE, + "SUBJECT", + task->tokens); } rspamd_stat_tokenize_parts_metadata (st_ctx, task); diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 604fc070e..dcd556910 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -251,7 +251,8 @@ rspamd_tokenize_text (const gchar *text, gsize len, enum rspamd_tokenize_type how, struct rspamd_config *cfg, GList *exceptions, - guint64 *hash) + guint64 *hash, + GArray *cur_words) { rspamd_stat_token_t token, buf; const gchar *pos = NULL; @@ -265,7 +266,7 @@ rspamd_tokenize_text (const gchar *text, gsize len, static UBreakIterator* bi = NULL; if (text == NULL) { - return NULL; + return cur_words; } buf.original.begin = text; @@ -281,8 +282,13 @@ rspamd_tokenize_text (const gchar *text, gsize len, initial_size = word_decay * 2; } - res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t), - initial_size); + if (!cur_words) { + res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t), + initial_size); + } + else { + res = cur_words; + } if (G_UNLIKELY (how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) { while (rspamd_tokenizer_get_word_raw (&buf, &pos, &token, &cur, &l, FALSE)) { @@ -474,71 +480,96 @@ start_over: #undef SHIFT_EX -GArray * -rspamd_tokenize_subject (struct rspamd_task *task) +static void +rspamd_add_metawords_from_str (const gchar *beg, gsize len, + struct rspamd_task *task) { UText utxt = UTEXT_INITIALIZER; UErrorCode uc_err = U_ZERO_ERROR; - gsize slen; - gboolean valid_utf = TRUE; - GArray *words = NULL; guint i = 0; - gint32 uc; - rspamd_stat_token_t *tok; + UChar32 uc; + gboolean valid_utf = TRUE; - if (task->subject) { - const gchar *p = task->subject; + while (i < len) { + U8_NEXT (beg, i, len, uc); - slen = strlen (task->subject); + if (((gint32) uc) < 0) { + valid_utf = FALSE; + break; + } - while (i < slen) { - U8_NEXT (p, i, slen, uc); +#if U_ICU_VERSION_MAJOR_NUM < 50 + if (u_isalpha (uc)) { + gint32 sc = ublock_getCode (uc); - if (((gint32) uc) < 0) { + if (sc == UBLOCK_THAI) { valid_utf = FALSE; + msg_info_task ("enable workaround for Thai characters for old libicu"); break; } -#if U_ICU_VERSION_MAJOR_NUM < 50 - if (u_isalpha (uc)) { - gint32 sc = ublock_getCode (uc); - - if (sc == UBLOCK_THAI) { - valid_utf = FALSE; - msg_info_task ("enable workaround for Thai characters for old libicu"); - break; - } - } -#endif } +#endif + } - if (valid_utf) { - utext_openUTF8 (&utxt, - task->subject, - slen, - &uc_err); + if (valid_utf) { + utext_openUTF8 (&utxt, + beg, + len, + &uc_err); - words = rspamd_tokenize_text (task->subject, slen, - &utxt, RSPAMD_TOKENIZE_UTF, - task->cfg, NULL, NULL); + task->meta_words = rspamd_tokenize_text (beg, len, + &utxt, RSPAMD_TOKENIZE_UTF, + task->cfg, NULL, NULL, task->meta_words); - utext_close (&utxt); - } - else { - words = rspamd_tokenize_text (task->subject, slen, - NULL, RSPAMD_TOKENIZE_RAW, - task->cfg, NULL, NULL); - } + utext_close (&utxt); } + else { + task->meta_words = rspamd_tokenize_text (beg, len, + NULL, RSPAMD_TOKENIZE_RAW, + task->cfg, NULL, NULL, task->meta_words); + } +} + +void +rspamd_tokenize_meta_words (struct rspamd_task *task) +{ + guint i = 0; + rspamd_stat_token_t *tok; + + if (task->subject) { + rspamd_add_metawords_from_str (task->subject, strlen (task->subject), task); + } + + if (task->from_mime) { + struct rspamd_email_address *addr; - if (words != NULL) { + addr = g_ptr_array_index (task->from_mime, 0); - for (i = 0; i < words->len; i++) { - tok = &g_array_index (words, rspamd_stat_token_t, i); - tok->flags |= RSPAMD_STAT_TOKEN_FLAG_SUBJECT; + if (addr->name) { + rspamd_add_metawords_from_str (addr->name, strlen (addr->name), task); } } - return words; + if (task->meta_words != NULL) { + const gchar *language = NULL; + + if (task->text_parts && task->text_parts->len > 0) { + struct rspamd_mime_text_part *tp = g_ptr_array_index (task->text_parts, 0); + + if (tp->language) { + language = tp->language; + } + } + + rspamd_normalize_words (task->meta_words, task->task_pool); + rspamd_stem_words (task->meta_words, task->task_pool, language, + task->lang_det); + + for (i = 0; i < task->meta_words->len; i++) { + tok = &g_array_index (task->meta_words, rspamd_stat_token_t, i); + tok->flags |= RSPAMD_STAT_TOKEN_FLAG_HEADER; + } + } } static inline void diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index 683d728ed..784426d31 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -43,7 +43,8 @@ GArray * rspamd_tokenize_text (const gchar *text, gsize len, enum rspamd_tokenize_type how, struct rspamd_config *cfg, GList *exceptions, - guint64 *hash); + guint64 *hash, + GArray *cur_words); /* OSB tokenize function */ gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, @@ -64,7 +65,7 @@ void rspamd_stem_words (GArray *words, rspamd_mempool_t *pool, const gchar *language, struct rspamd_lang_detector *d); -GArray * rspamd_tokenize_subject (struct rspamd_task *task); +void rspamd_tokenize_meta_words (struct rspamd_task *task); #endif /* * vi:ts=4 diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c index 4f28e9492..1b5f33cb7 100644 --- a/src/lua/lua_task.c +++ b/src/lua/lua_task.c @@ -4796,8 +4796,8 @@ lua_push_stat_token (lua_State *L, rspamd_token_t *tok) lua_pushboolean (L, true); lua_settable (L, -3); } - if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_SUBJECT) { - lua_pushstring (L, "subject"); + if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_HEADER) { + lua_pushstring (L, "header"); lua_pushboolean (L, true); lua_settable (L, -3); } diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c index ec22b8a9a..a064bce5b 100644 --- a/src/lua/lua_util.c +++ b/src/lua/lua_util.c @@ -1144,7 +1144,7 @@ lua_util_tokenize_text (lua_State *L) &utxt, RSPAMD_TOKENIZE_UTF, NULL, exceptions, - NULL); + NULL, NULL); if (res == NULL) { lua_pushnil (L); diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c index c566cc517..e48d19ea2 100644 --- a/src/plugins/chartable.c +++ b/src/plugins/chartable.c @@ -621,35 +621,27 @@ chartable_symbol_callback (struct rspamd_task *task, rspamd_chartable_process_part (task, part, chartable_module_ctx); } - if (task->subject != NULL) { - GArray *words; + if (task->meta_words != NULL) { rspamd_stat_token_t *w; - gdouble cur_score = 0.0; + gdouble cur_score = 0; + gsize arlen = task->meta_words->len; - words = rspamd_tokenize_subject (task); - - if (words && words->len > 0) { - for (i = 0; i < words->len; i++) { - w = &g_array_index (words, rspamd_stat_token_t, i); - cur_score += rspamd_chartable_process_word_utf (task, w, FALSE, - NULL, chartable_module_ctx); - } - - cur_score /= (gdouble)words->len; - - if (cur_score > 2.0) { - cur_score = 2.0; - } + for (i = 0; i < arlen; i++) { + w = &g_array_index (task->meta_words, rspamd_stat_token_t, i); + cur_score += rspamd_chartable_process_word_utf (task, w, FALSE, + NULL, chartable_module_ctx); + } - if (cur_score > chartable_module_ctx->threshold) { - rspamd_task_insert_result (task, chartable_module_ctx->symbol, - cur_score, "subject"); + cur_score /= (gdouble)arlen; - } + if (cur_score > 2.0) { + cur_score = 2.0; } - if (words) { - g_array_free (words, TRUE); + if (cur_score > chartable_module_ctx->threshold) { + rspamd_task_insert_result (task, chartable_module_ctx->symbol, + cur_score, "subject"); + } } -- 2.39.5