From a8e5e53c38ec2289c240008bf918ec086ec7a08a Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Wed, 6 Jun 2018 16:35:57 +0100 Subject: [PATCH] [Feature] Split message parsing and processing --- src/libmime/message.c | 84 +++++++++++++++++++++------------------ src/libmime/message.h | 6 +++ src/libserver/task.c | 6 +++ src/libserver/task.h | 31 ++++++++------- src/lua/lua_task.c | 3 +- src/plugins/fuzzy_check.c | 2 + src/rspamadm/lua_repl.c | 1 + 7 files changed, 79 insertions(+), 54 deletions(-) diff --git a/src/libmime/message.c b/src/libmime/message.c index 5681c3d00..e5e43c5be 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -907,13 +907,10 @@ rspamd_message_from_data (struct rspamd_task *task, const guchar *start, gboolean rspamd_message_parse (struct rspamd_task *task) { - struct rspamd_mime_text_part *p1, *p2; struct received_header *recv, *trecv; const gchar *p; gsize len; guint i; - gdouble diff, *pdiff; - guint tw, *ptw, dw; GError *err = NULL; rspamd_cryptobox_hash_state_t st; guchar digest_out[rspamd_cryptobox_HASHBYTES]; @@ -1016,16 +1013,6 @@ rspamd_message_parse (struct rspamd_task *task) task->queue_id = "undef"; } - for (i = 0; i < task->parts->len; i ++) { - struct rspamd_mime_part *part; - - part = g_ptr_array_index (task->parts, i); - rspamd_message_process_text_part (task, part); - } - - rspamd_images_process (task); - rspamd_archives_process (task); - if (task->received->len > 0) { gboolean need_recv_correction = FALSE; rspamd_inet_addr_t *raddr; @@ -1130,6 +1117,50 @@ rspamd_message_parse (struct rspamd_task *task) rspamd_url_task_subject_callback, task); } + for (i = 0; i < task->parts->len; i ++) { + struct rspamd_mime_part *part; + + part = g_ptr_array_index (task->parts, i); + rspamd_cryptobox_hash_update (&st, part->digest, sizeof (part->digest)); + } + + rspamd_cryptobox_hash_final (&st, digest_out); + memcpy (task->digest, digest_out, sizeof (task->digest)); + + if (task->queue_id) { + msg_info_task ("loaded message; id: <%s>; queue-id: <%s>; size: %z; " + "checksum: <%*xs>", + task->message_id, task->queue_id, task->msg.len, + (gint)sizeof (task->digest), task->digest); + } + else { + msg_info_task ("loaded message; id: <%s>; size: %z; " + "checksum: <%*xs>", + task->message_id, task->msg.len, + (gint)sizeof (task->digest), task->digest); + } + + return TRUE; +} + +void +rspamd_message_process (struct rspamd_task *task) +{ + guint i; + struct rspamd_mime_text_part *p1, *p2; + gdouble diff, *pdiff; + guint tw, *ptw, dw; + + for (i = 0; i < task->parts->len; i ++) { + struct rspamd_mime_part *part; + + part = g_ptr_array_index (task->parts, i); + rspamd_message_process_text_part (task, part); + } + + rspamd_images_process (task); + rspamd_archives_process (task); + /* Calculate distance for 2-parts messages */ if (task->text_parts->len == 2) { p1 = g_ptr_array_index (task->text_parts, 0); @@ -1144,7 +1175,7 @@ rspamd_message_parse (struct rspamd_task *task) if (rspamd_ftok_cmp (&p1->mime_part->parent_part->ct->subtype, &srch) == 0) { if (!IS_PART_EMPTY (p1) && !IS_PART_EMPTY (p2) && - p1->normalized_hashes && p2->normalized_hashes) { + p1->normalized_hashes && p2->normalized_hashes) { /* * We also detect language on one part and propagate it to * another one @@ -1219,13 +1250,6 @@ rspamd_message_parse (struct rspamd_task *task) } } - for (i = 0; i < task->parts->len; i ++) { - struct rspamd_mime_part *part; - - part = g_ptr_array_index (task->parts, i); - rspamd_cryptobox_hash_update (&st, part->digest, sizeof (part->digest)); - } - /* Calculate average words length and number of short words */ struct rspamd_mime_text_part *text_part; gdouble *var; @@ -1258,24 +1282,6 @@ rspamd_message_parse (struct rspamd_task *task) *var /= (double)total_words; } } - - rspamd_cryptobox_hash_final (&st, digest_out); - memcpy (task->digest, digest_out, sizeof (task->digest)); - - if (task->queue_id) { - msg_info_task ("loaded message; id: <%s>; queue-id: <%s>; size: %z; " - "checksum: <%*xs>", - task->message_id, task->queue_id, task->msg.len, - (gint)sizeof (task->digest), task->digest); - } - else { - msg_info_task ("loaded message; id: <%s>; size: %z; " - "checksum: <%*xs>", - task->message_id, task->msg.len, - (gint)sizeof (task->digest), task->digest); - } - - return TRUE; } diff --git a/src/libmime/message.h b/src/libmime/message.h index 0ed2a5c66..b16011666 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -142,6 +142,12 @@ struct received_header { */ gboolean rspamd_message_parse (struct rspamd_task *task); +/** + * Process content in task (e.g. HTML parsing) + * @param task + */ +void rspamd_message_process (struct rspamd_task *task); + /** * Get an array of header's values with specified header's name using raw headers * @param task worker task structure diff --git a/src/libserver/task.c b/src/libserver/task.c index b5594816b..9be780b1b 100644 --- a/src/libserver/task.c +++ b/src/libserver/task.c @@ -738,6 +738,12 @@ rspamd_task_process (struct rspamd_task *task, guint stages) RSPAMD_TASK_STAGE_PRE_FILTERS); break; + case RSPAMD_TASK_STAGE_PROCESS_MESSAGE: + if (!(task->flags & RSPAMD_TASK_FLAG_SKIP_PROCESS)) { + rspamd_message_process (task); + } + break; + case RSPAMD_TASK_STAGE_FILTERS: rspamd_symbols_cache_process_symbols (task, task->cfg->cache, RSPAMD_TASK_STAGE_FILTERS); diff --git a/src/libserver/task.h b/src/libserver/task.h index e3c0492f6..b9fd2f811 100644 --- a/src/libserver/task.h +++ b/src/libserver/task.h @@ -41,25 +41,27 @@ enum rspamd_task_stage { RSPAMD_TASK_STAGE_ENVELOPE = (1 << 1), RSPAMD_TASK_STAGE_READ_MESSAGE = (1 << 2), RSPAMD_TASK_STAGE_PRE_FILTERS = (1 << 3), - RSPAMD_TASK_STAGE_FILTERS = (1 << 4), - RSPAMD_TASK_STAGE_CLASSIFIERS_PRE = (1 << 5), - RSPAMD_TASK_STAGE_CLASSIFIERS = (1 << 6), - RSPAMD_TASK_STAGE_CLASSIFIERS_POST = (1 << 7), - RSPAMD_TASK_STAGE_COMPOSITES = (1 << 8), - RSPAMD_TASK_STAGE_POST_FILTERS = (1 << 9), - RSPAMD_TASK_STAGE_LEARN_PRE = (1 << 10), - RSPAMD_TASK_STAGE_LEARN = (1 << 11), - RSPAMD_TASK_STAGE_LEARN_POST = (1 << 12), - RSPAMD_TASK_STAGE_COMPOSITES_POST = (1 << 13), - RSPAMD_TASK_STAGE_IDEMPOTENT = (1 << 14), - RSPAMD_TASK_STAGE_DONE = (1 << 15), - RSPAMD_TASK_STAGE_REPLIED = (1 << 16) + RSPAMD_TASK_STAGE_PROCESS_MESSAGE = (1 << 4), + RSPAMD_TASK_STAGE_FILTERS = (1 << 5), + RSPAMD_TASK_STAGE_CLASSIFIERS_PRE = (1 << 6), + RSPAMD_TASK_STAGE_CLASSIFIERS = (1 << 7), + RSPAMD_TASK_STAGE_CLASSIFIERS_POST = (1 << 8), + RSPAMD_TASK_STAGE_COMPOSITES = (1 << 9), + RSPAMD_TASK_STAGE_POST_FILTERS = (1 << 10), + RSPAMD_TASK_STAGE_LEARN_PRE = (1 << 11), + RSPAMD_TASK_STAGE_LEARN = (1 << 12), + RSPAMD_TASK_STAGE_LEARN_POST = (1 << 13), + RSPAMD_TASK_STAGE_COMPOSITES_POST = (1 << 14), + RSPAMD_TASK_STAGE_IDEMPOTENT = (1 << 15), + RSPAMD_TASK_STAGE_DONE = (1 << 16), + RSPAMD_TASK_STAGE_REPLIED = (1 << 17) }; #define RSPAMD_TASK_PROCESS_ALL (RSPAMD_TASK_STAGE_CONNECT | \ RSPAMD_TASK_STAGE_ENVELOPE | \ RSPAMD_TASK_STAGE_READ_MESSAGE | \ RSPAMD_TASK_STAGE_PRE_FILTERS | \ + RSPAMD_TASK_STAGE_PROCESS_MESSAGE | \ RSPAMD_TASK_STAGE_FILTERS | \ RSPAMD_TASK_STAGE_CLASSIFIERS_PRE | \ RSPAMD_TASK_STAGE_CLASSIFIERS | \ @@ -75,6 +77,7 @@ enum rspamd_task_stage { #define RSPAMD_TASK_PROCESS_LEARN (RSPAMD_TASK_STAGE_CONNECT | \ RSPAMD_TASK_STAGE_ENVELOPE | \ RSPAMD_TASK_STAGE_READ_MESSAGE | \ + RSPAMD_TASK_STAGE_PROCESS_MESSAGE | \ RSPAMD_TASK_STAGE_CLASSIFIERS_PRE | \ RSPAMD_TASK_STAGE_CLASSIFIERS | \ RSPAMD_TASK_STAGE_CLASSIFIERS_POST | \ @@ -85,7 +88,7 @@ enum rspamd_task_stage { #define RSPAMD_TASK_FLAG_MIME (1 << 0) #define RSPAMD_TASK_FLAG_JSON (1 << 1) -#define RSPAMD_TASK_FLAG_SKIP_EXTRA (1 << 2) +#define RSPAMD_TASK_FLAG_SKIP_PROCESS (1 << 2) #define RSPAMD_TASK_FLAG_SKIP (1 << 3) #define RSPAMD_TASK_FLAG_EXT_URLS (1 << 4) #define RSPAMD_TASK_FLAG_SPAMC (1 << 5) diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c index 3aa24e88b..85459d440 100644 --- a/src/lua/lua_task.c +++ b/src/lua/lua_task.c @@ -1096,8 +1096,9 @@ lua_task_process_message (lua_State *L) if (task != NULL) { if (task->msg.len > 0) { - if (rspamd_message_parse (task) == 0) { + if (rspamd_message_parse (task)) { lua_pushboolean (L, TRUE); + rspamd_message_process (task); } else { lua_pushboolean (L, FALSE); diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c index 0873d3c7a..c4318777f 100644 --- a/src/plugins/fuzzy_check.c +++ b/src/plugins/fuzzy_check.c @@ -3019,6 +3019,8 @@ fuzzy_process_handler (struct rspamd_http_connection_entry *conn_ent, "Message processing error"); return; } + + rspamd_message_process (task); } PTR_ARRAY_FOREACH (fuzzy_module_ctx->fuzzy_rules, i, rule) { diff --git a/src/rspamadm/lua_repl.c b/src/rspamadm/lua_repl.c index e2f3a8581..a807101e0 100644 --- a/src/rspamadm/lua_repl.c +++ b/src/rspamadm/lua_repl.c @@ -390,6 +390,7 @@ rspamadm_lua_message_handler (lua_State *L, gint argc, gchar **argv) continue; } + rspamd_message_process (task); lua_pushcfunction (L, &rspamd_lua_traceback); err_idx = lua_gettop (L); -- 2.39.5