From 00e5f24b527fab74d6447733025ef5a18a814018 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Thu, 14 Jul 2011 10:02:54 +0400 Subject: Change logic of params inside compare parts distance. During learning and classifying compare parts using new algorithm. Raise similarity factor. --- src/expressions.c | 4 ++-- src/filter.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 51 insertions(+), 11 deletions(-) diff --git a/src/expressions.c b/src/expressions.c index 3dfd542a4..b231bb309 100644 --- a/src/expressions.c +++ b/src/expressions.c @@ -1093,8 +1093,8 @@ rspamd_parts_distance (struct worker_task * task, GList * args, void *unused) debug_task ("got likeliness between parts of %d%%, threshold is %d%%", diff, threshold); *pdiff = diff; memory_pool_set_variable (task->task_pool, "parts_distance", pdiff, NULL); - if (threshold2 > 0 && threshold > threshold2) { - if (diff <= threshold && diff >= threshold2) { + if (threshold2 > 0 && threshold < threshold2) { + if (diff >= threshold && diff <= threshold2) { return TRUE; } } diff --git a/src/filter.c b/src/filter.c index 66f233115..8321e6d21 100644 --- a/src/filter.c +++ b/src/filter.c @@ -33,6 +33,7 @@ #include "settings.h" #include "view.h" #include "binlog.h" +#include "diff.h" #include "classifiers/classifiers.h" #include "tokenizers/tokenizers.h" @@ -40,7 +41,7 @@ # include "lua/lua_common.h" #endif -#define COMMON_PART_FACTOR 80 +#define COMMON_PART_FACTOR 95 static inline GQuark filter_error_quark (void) @@ -600,12 +601,13 @@ classifiers_callback (gpointer value, void *arg) struct worker_task *task = arg; struct classifier_config *cl = value; struct classifier_ctx *ctx; - struct mime_text_part *text_part; + struct mime_text_part *text_part, *p1, *p2; struct statfile *st; GTree *tokens = NULL; GList *cur; f_str_t c; gchar *header = NULL; + gint *dist = NULL, diff; gboolean is_twopart = FALSE; if ((header = g_hash_table_lookup (cl->opts, "header")) != NULL) { @@ -616,6 +618,7 @@ classifiers_callback (gpointer value, void *arg) } else { cur = g_list_first (task->text_parts); + dist = memory_pool_get_variable (task->task_pool, "parts_distance"); if (cur != NULL && cur->next != NULL && cur->next->next == NULL) { is_twopart = TRUE; } @@ -640,9 +643,24 @@ classifiers_callback (gpointer value, void *arg) cur = g_list_next (cur); continue; } - if (is_twopart && cur->next == NULL) { + if (dist != NULL && cur->next == NULL) { /* Compare part's content */ - if (fuzzy_compare_parts (cur->data, cur->prev->data) >= COMMON_PART_FACTOR) { + + if (*dist >= COMMON_PART_FACTOR) { + msg_info ("message <%s> has two common text parts, ignore the last one", task->message_id); + break; + } + } + else if (cur->next == NULL && is_twopart) { + p1 = cur->prev->data; + p2 = text_part; + if (p1->diff_str != NULL && p2->diff_str != NULL) { + diff = compare_diff_distance (p1->diff_str, p2->diff_str); + } + else { + diff = fuzzy_compare_parts (p1, p2); + } + if (diff >= COMMON_PART_FACTOR) { msg_info ("message <%s> has two common text parts, ignore the last one", task->message_id); break; } @@ -838,8 +856,10 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err) struct statfile *st; stat_file_t *stf; gdouble sum; - struct mime_text_part *part; + struct mime_text_part *part, *p1, *p2; gboolean is_utf = FALSE, is_twopart = FALSE; + gint diff; + /* Load classifier by symbol */ cl = g_hash_table_lookup (task->cfg->classifiers_symbols, statfile); @@ -883,7 +903,15 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err) ex = part->urls_offset; if (is_twopart && cur->next == NULL) { /* Compare part's content */ - if (fuzzy_compare_parts (cur->data, cur->prev->data) >= COMMON_PART_FACTOR) { + p1 = cur->prev->data; + p2 = part; + if (p1->diff_str != NULL && p2->diff_str != NULL) { + diff = compare_diff_distance (p1->diff_str, p2->diff_str); + } + else { + diff = fuzzy_compare_parts (p1, p2); + } + if (diff >= COMMON_PART_FACTOR) { msg_info ("message <%s> has two common text parts, ignore the last one", task->message_id); break; } @@ -951,8 +979,9 @@ learn_task_spam (struct classifier_config *cl, struct worker_task *task, gboolea struct classifier_ctx *cls_ctx; f_str_t c; GTree *tokens = NULL; - struct mime_text_part *part; + struct mime_text_part *part, *p1, *p2; gboolean is_utf = FALSE, is_twopart = FALSE; + gint diff; cur = g_list_first (task->text_parts); if (cur != NULL && cur->next != NULL && cur->next->next == NULL) { @@ -972,8 +1001,19 @@ learn_task_spam (struct classifier_config *cl, struct worker_task *task, gboolea is_utf = part->is_utf; ex = part->urls_offset; if (is_twopart && cur->next == NULL) { - /* Compare part's content */ - if (fuzzy_compare_parts (cur->data, cur->prev->data) >= COMMON_PART_FACTOR) { + /* + * Compare part's content + * Note: here we don't have filters proceeded this message, so using pool variable is a bad idea + */ + p1 = cur->prev->data; + p2 = part; + if (p1->diff_str != NULL && p2->diff_str != NULL) { + diff = compare_diff_distance (p1->diff_str, p2->diff_str); + } + else { + diff = fuzzy_compare_parts (p1, p2); + } + if (diff >= COMMON_PART_FACTOR) { msg_info ("message <%s> has two common text parts, ignore the last one", task->message_id); break; } -- cgit v1.2.3