diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-07-14 23:58:56 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-07-14 23:58:56 +0100 |
commit | d32b1c887e86319691da76a6ee8d6ee51b17dcab (patch) | |
tree | d33c362721c32ffd60a98dc900164dc0cbb92022 /src/libmime | |
parent | b98130a443131de8ed80cfdddfc70b3c4ccb9fc4 (diff) | |
download | rspamd-d32b1c887e86319691da76a6ee8d6ee51b17dcab.tar.gz rspamd-d32b1c887e86319691da76a6ee8d6ee51b17dcab.zip |
Move distance calculation to message parsing.
Diffstat (limited to 'src/libmime')
-rw-r--r-- | src/libmime/message.c | 98 | ||||
-rw-r--r-- | src/libmime/mime_expressions.c | 132 |
2 files changed, 97 insertions, 133 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c index b1d80f7e9..fde23ccb2 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -1254,6 +1254,49 @@ rspamd_normalize_text_part (struct rspamd_task *task, } } +#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c))) + +static gint +rspamd_words_levenshtein_distance (GArray *w1, GArray *w2) +{ + guint s1len, s2len, x, y, lastdiag, olddiag; + guint *column; + rspamd_fstring_t *s1, *s2; + gint eq; + static const guint max_words = 8192; + + s1len = w1->len; + s2len = w2->len; + + if (s1len > max_words) { + msg_err ("cannot compare parts with more than %ud words: %ud", + max_words, s1len); + return 0; + } + + column = g_alloca ((s1len + 1) * sizeof (guint)); + + for (y = 1; y <= s1len; y++) { + column[y] = y; + } + + for (x = 1; x <= s2len; x++) { + column[0] = x; + + for (y = 1, lastdiag = x - 1; y <= s1len; y++) { + olddiag = column[y]; + s1 = &g_array_index (w1, rspamd_fstring_t, y - 1); + s2 = &g_array_index (w1, rspamd_fstring_t, x - 1); + eq = rspamd_fstring_equal (s1, s2) ? 0 : 1; + column[y] = MIN3 (column[y] + 1, column[y - 1] + 1, + lastdiag + (eq)); + lastdiag = olddiag; + } + } + + return column[s1len]; +} + static int rspamd_gtube_cb (int strnum, int textpos, void *context) { @@ -1624,6 +1667,9 @@ rspamd_message_parse (struct rspamd_task *task) GList *first, *cur; GMimePart *part; GMimeDataWrapper *wrapper; + GMimeObject *parent; + const GMimeContentType *ct; + struct mime_text_part *p1, *p2; struct mime_foreach_data md; struct received_header *recv; gchar *mid, *url_str; @@ -1631,7 +1677,8 @@ rspamd_message_parse (struct rspamd_task *task) struct rspamd_url *subject_url; gsize len; gint64 hdr_start, hdr_end; - gint rc, state = 0; + gint rc, state = 0, diff, *pdiff; + guint tw, dw; tmp = rspamd_mempool_alloc (task->task_pool, sizeof (GByteArray)); p = task->msg.start; @@ -1863,6 +1910,55 @@ rspamd_message_parse (struct rspamd_task *task) } } + /* Calculate distance for 2-parts messages */ + if (task->text_parts->len == 2) { + p1 = g_ptr_array_index (task->text_parts, 0); + p2 = g_ptr_array_index (task->text_parts, 1); + + /* First of all check parent object */ + if (p1->parent && p1->parent == p2->parent) { + parent = p1->parent; + ct = g_mime_object_get_content_type (parent); + if (ct == NULL || + !g_mime_content_type_is_type ((GMimeContentType *)ct, + "multipart", "alternative")) { + debug_task ( + "two parts are not belong to multipart/alternative container, skip check"); + } + } + else { + debug_task ( + "message contains two parts but they are in different multi-parts"); + } + + if (!IS_PART_EMPTY (p1) && !IS_PART_EMPTY (p2) && + p1->normalized_words && p2->normalized_words) { + + tw = MAX (p1->normalized_words->len, p2->normalized_words->len); + dw = rspamd_words_levenshtein_distance (p1->normalized_words, + p2->normalized_words); + diff = tw > 0 ? (100.0 * (gdouble)(tw - dw) / (gdouble)tw) : 100; + + msg_info ( + "different words: %d, total words: %d, " + "got likeliness between parts of %d%%", + dw, tw, + diff); + + pdiff = rspamd_mempool_alloc (task->task_pool, sizeof (gint)); + *pdiff = diff; + rspamd_mempool_set_variable (task->task_pool, + "parts_distance", + pdiff, + NULL); + } + } + else { + debug_task ( + "message has too many text parts, so do not try to compare " + "them with each other"); + } + return TRUE; } diff --git a/src/libmime/mime_expressions.c b/src/libmime/mime_expressions.c index 446493b4d..be49f11d8 100644 --- a/src/libmime/mime_expressions.c +++ b/src/libmime/mime_expressions.c @@ -1165,42 +1165,6 @@ rspamd_header_exists (struct rspamd_task * task, GArray * args, void *unused) return FALSE; } -#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c))) - -static gint -rspamd_words_levenshtein_distance (GArray *w1, GArray *w2) -{ - guint s1len, s2len, x, y, lastdiag, olddiag; - guint *column; - rspamd_fstring_t *s1, *s2; - gint eq; - - s1len = w1->len; - s2len = w2->len; - - column = g_alloca ((s1len + 1) * sizeof (guint)); - - for (y = 1; y <= s1len; y++) { - column[y] = y; - } - - for (x = 1; x <= s2len; x++) { - column[0] = x; - - for (y = 1, lastdiag = x - 1; y <= s1len; y++) { - olddiag = column[y]; - s1 = &g_array_index (w1, rspamd_fstring_t, y - 1); - s2 = &g_array_index (w1, rspamd_fstring_t, x - 1); - eq = rspamd_fstring_equal (s1, s2) ? 0 : 1; - column[y] = MIN3 (column[y] + 1, column[y - 1] + 1, - lastdiag + (eq)); - lastdiag = olddiag; - } - } - - return column[s1len]; -} - /* * This function is designed to find difference between text/html and text/plain parts @@ -1212,11 +1176,7 @@ gboolean rspamd_parts_distance (struct rspamd_task * task, GArray * args, void *unused) { gint threshold, threshold2 = -1, diff; - struct mime_text_part *p1, *p2; struct expression_argument *arg; - GMimeObject *parent; - const GMimeContentType *ct; - guint tw, dw; gint *pdiff; if (args == NULL || args->len == 0) { @@ -1278,98 +1238,6 @@ rspamd_parts_distance (struct rspamd_task * task, GArray * args, void *unused) } } - if (task->text_parts->len == 2) { - p1 = g_ptr_array_index (task->text_parts, 0); - p2 = g_ptr_array_index (task->text_parts, 1); - pdiff = rspamd_mempool_alloc (task->task_pool, sizeof (gint)); - *pdiff = -1; - - /* First of all check parent object */ - if (p1->parent && p1->parent == p2->parent) { - parent = p1->parent; - ct = g_mime_object_get_content_type (parent); -#ifndef GMIME24 - if (ct == NULL || - !g_mime_content_type_is_type (ct, "multipart", "alternative")) { -#else - if (ct == NULL || - !g_mime_content_type_is_type ((GMimeContentType *)ct, - "multipart", "alternative")) { -#endif - debug_task ( - "two parts are not belong to multipart/alternative container, skip check"); - rspamd_mempool_set_variable (task->task_pool, - "parts_distance", - pdiff, - NULL); - return FALSE; - } - } - else { - debug_task ( - "message contains two parts but they are in different multi-parts"); - rspamd_mempool_set_variable (task->task_pool, - "parts_distance", - pdiff, - NULL); - return FALSE; - } - if (!IS_PART_EMPTY (p1) && !IS_PART_EMPTY (p2) && - p1->normalized_words && p2->normalized_words) { - - tw = MAX (p1->normalized_words->len, p2->normalized_words->len); - dw = rspamd_words_levenshtein_distance (p1->normalized_words, - p2->normalized_words); - diff = tw > 0 ? (100.0 * (gdouble)(tw - dw) / (gdouble)tw) : 100; - - msg_debug ( - "different words: %d, total words: %d, " - "got likeliness between parts of %d%%, threshold is %d%%", - dw, tw, - diff, - threshold); - - *pdiff = diff; - rspamd_mempool_set_variable (task->task_pool, - "parts_distance", - pdiff, - NULL); - if (threshold2 > 0) { - if (diff >= - MIN (threshold, - threshold2) && diff < MAX (threshold, threshold2)) { - return TRUE; - } - } - else { - if (diff <= threshold) { - return TRUE; - } - } - } - else if ((IS_PART_EMPTY (p1) && - !IS_PART_EMPTY (p2)) || (!IS_PART_EMPTY (p1)&& IS_PART_EMPTY (p2))) { - /* Empty and non empty parts are different */ - *pdiff = 0; - rspamd_mempool_set_variable (task->task_pool, - "parts_distance", - pdiff, - NULL); - return TRUE; - } - } - else { - debug_task ( - "message has too many text parts, so do not try to compare them with each other"); - rspamd_mempool_set_variable (task->task_pool, - "parts_distance", - pdiff, - NULL); - return FALSE; - } - - rspamd_mempool_set_variable (task->task_pool, "parts_distance", pdiff, - NULL); return FALSE; } |