From d150a467bbf10dc207b8b8322d029cf4d37ab109 Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Wed, 27 Apr 2016 16:05:15 +0100
Subject: [PATCH] [Fix] Fix and rescore R_PARTS_DIFFER logic

Signed-off-by: Vsevolod Stakhov
---
Review notes (this section is skipped by git-am):
 * "parts_distance" is now stored as a gdouble holding
   2 * different_words / total_words, where total_words is the sum of the
   word counts of both parts; "total_words" is stored next to it.
 * rules/misc.lua: `score` is declared once before the if/else, otherwise it
   is out of scope when task:insert_result() is called.
 * src/libmime/message.c: the statistics line stays at debug level;
   the "total_words" slot is sized after the guint it holds.
 * src/libstat/stat_process.c: the debug message formats the similarity as a
   double instead of passing a gdouble to %d.
 * The blob ids on the "index" lines were not regenerated after these fixes.

 rules/misc.lua                 | 22 ++++++++++++++++------
 src/libmime/message.c          | 21 +++++++++++++++------
 src/libmime/mime_expressions.c | 13 +++++++------
 src/libstat/stat_process.c     | 10 +++++-----
 4 files changed, 43 insertions(+), 23 deletions(-)

diff --git a/rules/misc.lua b/rules/misc.lua
index b3926e46b..2c6d50317 100644
--- a/rules/misc.lua
+++ b/rules/misc.lua
@@ -33,15 +33,25 @@ reconf['R_FLASH_REDIR_IMGSHACK'] = '/^(?:http:\\/\\/)?img\\d{1,5}\\.imageshack\\
 
 -- Different text parts
 rspamd_config.R_PARTS_DIFFER = function(task)
-  local distance = task:get_mempool():get_variable('parts_distance', 'int')
+  local distance = task:get_mempool():get_variable('parts_distance', 'double')
 
   if distance then
     local nd = tonumber(distance)
-
-    if nd < 50 then
-      local score = 1 - util.tanh(nd / 100.0)
-
-      task:insert_result('R_PARTS_DIFFER', score, tostring(nd) .. '%')
+    -- ND is relation of different words to total words
+    if nd >= 0.5 then
+      local tw = task:get_mempool():get_variable('total_words', 'int')
+
+      if tw then
+        local score -- assigned by either branch, read by insert_result below
+        if tw > 30 then
+          -- We are confident about difference
+          score = (nd - 0.5) * 2.0
+        else
+          -- We are not so confident about difference
+          score = (nd - 0.5)
+        end
+        task:insert_result('R_PARTS_DIFFER', score, tostring(100.0 * nd) .. '%')
+      end
     end
   end
 
diff --git a/src/libmime/message.c b/src/libmime/message.c
index cd18f6ce4..444d5c7d7 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -1566,8 +1566,9 @@ rspamd_message_parse (struct rspamd_task *task)
 	const gchar *p;
 	gsize len;
 	goffset hdr_pos;
-	gint diff, *pdiff, i;
-	guint tw, dw;
+	gint i;
+	gdouble diff, *pdiff;
+	guint tw, *ptw, dw;
 
 	tmp = rspamd_mempool_alloc (task->task_pool, sizeof (GByteArray));
 	p = task->msg.begin;
@@ -1843,26 +1844,34 @@ rspamd_message_parse (struct rspamd_task *task)
 			if (!IS_PART_EMPTY (p1) && !IS_PART_EMPTY (p2) &&
 					p1->normalized_words && p2->normalized_words) {
 
-				tw = MAX (p1->normalized_words->len, p2->normalized_words->len);
+				tw = p1->normalized_words->len + p2->normalized_words->len;
 
 				if (tw > 0) {
 					dw = rspamd_words_levenshtein_distance (task,
 							p1->normalized_words,
 							p2->normalized_words);
-					diff = (100.0 * (gdouble)(tw - dw) / (gdouble)tw);
+					diff = (2.0 * (gdouble)dw) / (gdouble)tw;
 
 					debug_task (
 							"different words: %d, total words: %d, "
-							"got likeliness between parts of %d%%",
+							"got diff between parts of %.2f",
 							dw, tw,
 							diff);
 
-					pdiff = rspamd_mempool_alloc (task->task_pool, sizeof (gint));
+					pdiff = rspamd_mempool_alloc (task->task_pool,
+							sizeof (gdouble));
 					*pdiff = diff;
 					rspamd_mempool_set_variable (task->task_pool,
 							"parts_distance",
 							pdiff,
 							NULL);
+					ptw = rspamd_mempool_alloc (task->task_pool,
+							sizeof (guint));
+					*ptw = tw;
+					rspamd_mempool_set_variable (task->task_pool,
+							"total_words",
+							ptw,
+							NULL);
 				}
 			}
 		}
diff --git a/src/libmime/mime_expressions.c b/src/libmime/mime_expressions.c
index ea8af2dcd..c107703a9 100644
--- a/src/libmime/mime_expressions.c
+++ b/src/libmime/mime_expressions.c
@@ -955,9 +955,9 @@ rspamd_header_exists (struct rspamd_task * task, GArray * args, void *unused)
 gboolean
 rspamd_parts_distance (struct rspamd_task * task, GArray * args, void *unused)
 {
-	gint threshold, threshold2 = -1, diff;
+	gint threshold, threshold2 = -1;
 	struct expression_argument *arg;
-	gint *pdiff;
+	gdouble *pdiff, diff;
 
 	if (args == NULL || args->len == 0) {
 		debug_task ("no threshold is specified, assume it 100");
@@ -997,12 +997,13 @@ rspamd_parts_distance (struct rspamd_task * task, GArray * args, void *unused)
 
 	if ((pdiff = rspamd_mempool_get_variable (task->task_pool,
 			"parts_distance")) != NULL) {
-		diff = *pdiff;
+		diff = (1.0 - (*pdiff)) * 100.0;
+
 		if (diff != -1) {
 			if (threshold2 > 0) {
-				if (diff >=
-					MIN (threshold,
-					threshold2) && diff < MAX (threshold, threshold2)) {
+				if (diff >= MIN (threshold, threshold2) &&
+						diff < MAX (threshold, threshold2)) {
+
 					return TRUE;
 				}
 			}
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index e6d34e406..486d82c08 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -28,7 +28,7 @@
 #define RSPAMD_LEARN_OP 1
 #define RSPAMD_UNLEARN_OP 2
 
-static const gint similarity_treshold = 80;
+static const gdouble similarity_treshold = 80.0;
 
 static void
 rspamd_stat_tokenize_header (struct rspamd_task *task,
@@ -173,7 +173,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
 	GArray *words;
 	gchar *sub;
 	guint i, reserved_len = 0;
-	gint *pdiff;
+	gdouble *pdiff;
 
 	for (i = 0; i < task->text_parts->len; i++) {
 		part = g_ptr_array_index (task->text_parts, i);
@@ -200,7 +200,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
 		}
 
 
-		if (pdiff != NULL && *pdiff > similarity_treshold) {
-			msg_debug_task ("message has two common parts (%d%%), so skip the last one",
-					*pdiff);
+		if (pdiff != NULL && (1.0 - *pdiff) * 100.0 > similarity_treshold) {
+			msg_debug_task ("message has two common parts (%.2f%%), so skip the last one",
+					(1.0 - *pdiff) * 100.0);
 			break;
-- 
2.39.5