Change the logic of the threshold parameters in the compare-parts-distance function.

During learning and classifying, compare parts using the new algorithm. Raise the similarity factor (COMMON_PART_FACTOR) from 80 to 95.
author: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2011-07-14 10:02:54 +0400
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2011-07-14 10:02:54 +0400
commit: 00e5f24b527fab74d6447733025ef5a18a814018 (patch)
tree: 02dab75ca73e0a99003946b273e4bc67edb783c0 /src
parent: b0ddff4f0d56a877305649a14b902b3f23140b4b (diff)
download: rspamd-00e5f24b527fab74d6447733025ef5a18a814018.tar.gz
rspamd-00e5f24b527fab74d6447733025ef5a18a814018.zip
2 files changed, 51 insertions, 11 deletions
diff --git a/src/expressions.c b/src/expressions.c
index 3dfd542a4..b231bb309 100644
--- a/src/expressions.c
+++ b/src/expressions.c
@@ -1093,8 +1093,8 @@ rspamd_parts_distance (struct worker_task * task, GList * args, void *unused)
 			debug_task ("got likeliness between parts of %d%%, threshold is %d%%", diff, threshold);
 			*pdiff = diff;
 			memory_pool_set_variable (task->task_pool, "parts_distance", pdiff, NULL);
-			if (threshold2 > 0 && threshold > threshold2) {
-				if (diff <= threshold && diff >= threshold2) {
+			if (threshold2 > 0 && threshold < threshold2) {
+				if (diff >= threshold && diff <= threshold2) {
 					return TRUE;
 				}
 			}
diff --git a/src/filter.c b/src/filter.c
index 66f233115..8321e6d21 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -33,6 +33,7 @@
 #include "settings.h"
 #include "view.h"
 #include "binlog.h"
+#include "diff.h"
 #include "classifiers/classifiers.h"
 #include "tokenizers/tokenizers.h"
 
@@ -40,7 +41,7 @@
 #   include "lua/lua_common.h"
 #endif
 
-#define COMMON_PART_FACTOR 80
+#define COMMON_PART_FACTOR 95
 
 static inline                   GQuark
 filter_error_quark (void)
@@ -600,12 +601,13 @@ classifiers_callback (gpointer value, void *arg)
 	struct worker_task             *task = arg;
 	struct classifier_config       *cl = value;
 	struct classifier_ctx          *ctx;
-	struct mime_text_part          *text_part;
+	struct mime_text_part          *text_part, *p1, *p2;
 	struct statfile                *st;
 	GTree                          *tokens = NULL;
 	GList                          *cur;
 	f_str_t                         c;
 	gchar                          *header = NULL;
+	gint                           *dist = NULL, diff;
 	gboolean                        is_twopart = FALSE;
 	
 	if ((header = g_hash_table_lookup (cl->opts, "header")) != NULL) {
@@ -616,6 +618,7 @@ classifiers_callback (gpointer value, void *arg)
 	}
 	else {
 		cur = g_list_first (task->text_parts);
+		dist =  memory_pool_get_variable (task->task_pool, "parts_distance");
 		if (cur != NULL && cur->next != NULL && cur->next->next == NULL) {
 			is_twopart = TRUE;
 		}
@@ -640,9 +643,24 @@ classifiers_callback (gpointer value, void *arg)
 					cur = g_list_next (cur);
 					continue;
 				}
-				if (is_twopart && cur->next == NULL) {
+				if (dist != NULL && cur->next == NULL) {
 					/* Compare part's content */
-					if (fuzzy_compare_parts (cur->data, cur->prev->data) >= COMMON_PART_FACTOR) {
+
+					if (*dist >= COMMON_PART_FACTOR) {
+						msg_info ("message <%s> has two common text parts, ignore the last one", task->message_id);
+						break;
+					}
+				}
+				else if (cur->next == NULL && is_twopart) {
+					p1 = cur->prev->data;
+					p2 = text_part;
+					if (p1->diff_str != NULL && p2->diff_str != NULL) {
+						diff = compare_diff_distance (p1->diff_str, p2->diff_str);
+					}
+					else {
+						diff = fuzzy_compare_parts (p1, p2);
+					}
+					if (diff >= COMMON_PART_FACTOR) {
 						msg_info ("message <%s> has two common text parts, ignore the last one", task->message_id);
 						break;
 					}
@@ -838,8 +856,10 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
 	struct statfile                *st;
 	stat_file_t                    *stf;
 	gdouble                         sum;
-	struct mime_text_part          *part;
+	struct mime_text_part          *part, *p1, *p2;
 	gboolean                        is_utf = FALSE, is_twopart = FALSE;
+	gint                            diff;
+
 
 	/* Load classifier by symbol */
 	cl = g_hash_table_lookup (task->cfg->classifiers_symbols, statfile);
@@ -883,7 +903,15 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
 			ex = part->urls_offset;
 			if (is_twopart && cur->next == NULL) {
 				/* Compare part's content */
-				if (fuzzy_compare_parts (cur->data, cur->prev->data) >= COMMON_PART_FACTOR) {
+				p1 = cur->prev->data;
+				p2 = part;
+				if (p1->diff_str != NULL && p2->diff_str != NULL) {
+					diff = compare_diff_distance (p1->diff_str, p2->diff_str);
+				}
+				else {
+					diff = fuzzy_compare_parts (p1, p2);
+				}
+				if (diff >= COMMON_PART_FACTOR) {
 					msg_info ("message <%s> has two common text parts, ignore the last one", task->message_id);
 					break;
 				}
@@ -951,8 +979,9 @@ learn_task_spam (struct classifier_config *cl, struct worker_task *task, gboolea
 	struct classifier_ctx          *cls_ctx;
 	f_str_t                         c;
 	GTree                          *tokens = NULL;
-	struct mime_text_part          *part;
+	struct mime_text_part          *part, *p1, *p2;
 	gboolean                        is_utf = FALSE, is_twopart = FALSE;
+	gint                            diff;
 
 	cur = g_list_first (task->text_parts);
 	if (cur != NULL && cur->next != NULL && cur->next->next == NULL) {
@@ -972,8 +1001,19 @@ learn_task_spam (struct classifier_config *cl, struct worker_task *task, gboolea
 		is_utf = part->is_utf;
 		ex = part->urls_offset;
 		if (is_twopart && cur->next == NULL) {
-			/* Compare part's content */
-			if (fuzzy_compare_parts (cur->data, cur->prev->data) >= COMMON_PART_FACTOR) {
+			/*
+			 * Compare part's content
+			 * Note: here we don't have filters proceeded this message, so using pool variable is a bad idea
+			 */
+			p1 = cur->prev->data;
+			p2 = part;
+			if (p1->diff_str != NULL && p2->diff_str != NULL) {
+				diff = compare_diff_distance (p1->diff_str, p2->diff_str);
+			}
+			else {
+				diff = fuzzy_compare_parts (p1, p2);
+			}
+			if (diff >= COMMON_PART_FACTOR) {
 				msg_info ("message <%s> has two common text parts, ignore the last one", task->message_id);
 				break;
 			}
author	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2011-07-14 10:02:54 +0400
committer	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2011-07-14 10:02:54 +0400
commit	00e5f24b527fab74d6447733025ef5a18a814018 (patch)
tree	02dab75ca73e0a99003946b273e4bc67edb783c0 /src
parent	b0ddff4f0d56a877305649a14b902b3f23140b4b (diff)
download	rspamd-00e5f24b527fab74d6447733025ef5a18a814018.tar.gz rspamd-00e5f24b527fab74d6447733025ef5a18a814018.zip