summaryrefslogtreecommitdiffstats
path: root/src/filter.c
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2011-07-14 10:02:54 +0400
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2011-07-14 10:02:54 +0400
commit00e5f24b527fab74d6447733025ef5a18a814018 (patch)
tree02dab75ca73e0a99003946b273e4bc67edb783c0 /src/filter.c
parentb0ddff4f0d56a877305649a14b902b3f23140b4b (diff)
downloadrspamd-00e5f24b527fab74d6447733025ef5a18a814018.tar.gz
rspamd-00e5f24b527fab74d6447733025ef5a18a814018.zip
Change the logic of the parameters used when comparing the distance between parts.
During learning and classification, compare parts using the new algorithm. Raise the similarity factor.
Diffstat (limited to 'src/filter.c')
-rw-r--r-- src/filter.c 58
1 files changed, 49 insertions, 9 deletions
diff --git a/src/filter.c b/src/filter.c
index 66f233115..8321e6d21 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -33,6 +33,7 @@
#include "settings.h"
#include "view.h"
#include "binlog.h"
+#include "diff.h"
#include "classifiers/classifiers.h"
#include "tokenizers/tokenizers.h"
@@ -40,7 +41,7 @@
# include "lua/lua_common.h"
#endif
-#define COMMON_PART_FACTOR 80
+#define COMMON_PART_FACTOR 95
static inline GQuark
filter_error_quark (void)
@@ -600,12 +601,13 @@ classifiers_callback (gpointer value, void *arg)
struct worker_task *task = arg;
struct classifier_config *cl = value;
struct classifier_ctx *ctx;
- struct mime_text_part *text_part;
+ struct mime_text_part *text_part, *p1, *p2;
struct statfile *st;
GTree *tokens = NULL;
GList *cur;
f_str_t c;
gchar *header = NULL;
+ gint *dist = NULL, diff;
gboolean is_twopart = FALSE;
if ((header = g_hash_table_lookup (cl->opts, "header")) != NULL) {
@@ -616,6 +618,7 @@ classifiers_callback (gpointer value, void *arg)
}
else {
cur = g_list_first (task->text_parts);
+ dist = memory_pool_get_variable (task->task_pool, "parts_distance");
if (cur != NULL && cur->next != NULL && cur->next->next == NULL) {
is_twopart = TRUE;
}
@@ -640,9 +643,24 @@ classifiers_callback (gpointer value, void *arg)
cur = g_list_next (cur);
continue;
}
- if (is_twopart && cur->next == NULL) {
+ if (dist != NULL && cur->next == NULL) {
/* Compare part's content */
- if (fuzzy_compare_parts (cur->data, cur->prev->data) >= COMMON_PART_FACTOR) {
+
+ if (*dist >= COMMON_PART_FACTOR) {
+ msg_info ("message <%s> has two common text parts, ignore the last one", task->message_id);
+ break;
+ }
+ }
+ else if (cur->next == NULL && is_twopart) {
+ p1 = cur->prev->data;
+ p2 = text_part;
+ if (p1->diff_str != NULL && p2->diff_str != NULL) {
+ diff = compare_diff_distance (p1->diff_str, p2->diff_str);
+ }
+ else {
+ diff = fuzzy_compare_parts (p1, p2);
+ }
+ if (diff >= COMMON_PART_FACTOR) {
msg_info ("message <%s> has two common text parts, ignore the last one", task->message_id);
break;
}
@@ -838,8 +856,10 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
struct statfile *st;
stat_file_t *stf;
gdouble sum;
- struct mime_text_part *part;
+ struct mime_text_part *part, *p1, *p2;
gboolean is_utf = FALSE, is_twopart = FALSE;
+ gint diff;
+
/* Load classifier by symbol */
cl = g_hash_table_lookup (task->cfg->classifiers_symbols, statfile);
@@ -883,7 +903,15 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
ex = part->urls_offset;
if (is_twopart && cur->next == NULL) {
/* Compare part's content */
- if (fuzzy_compare_parts (cur->data, cur->prev->data) >= COMMON_PART_FACTOR) {
+ p1 = cur->prev->data;
+ p2 = part;
+ if (p1->diff_str != NULL && p2->diff_str != NULL) {
+ diff = compare_diff_distance (p1->diff_str, p2->diff_str);
+ }
+ else {
+ diff = fuzzy_compare_parts (p1, p2);
+ }
+ if (diff >= COMMON_PART_FACTOR) {
msg_info ("message <%s> has two common text parts, ignore the last one", task->message_id);
break;
}
@@ -951,8 +979,9 @@ learn_task_spam (struct classifier_config *cl, struct worker_task *task, gboolea
struct classifier_ctx *cls_ctx;
f_str_t c;
GTree *tokens = NULL;
- struct mime_text_part *part;
+ struct mime_text_part *part, *p1, *p2;
gboolean is_utf = FALSE, is_twopart = FALSE;
+ gint diff;
cur = g_list_first (task->text_parts);
if (cur != NULL && cur->next != NULL && cur->next->next == NULL) {
@@ -972,8 +1001,19 @@ learn_task_spam (struct classifier_config *cl, struct worker_task *task, gboolea
is_utf = part->is_utf;
ex = part->urls_offset;
if (is_twopart && cur->next == NULL) {
- /* Compare part's content */
- if (fuzzy_compare_parts (cur->data, cur->prev->data) >= COMMON_PART_FACTOR) {
+ /*
+ * Compare part's content
+ * Note: no filters have processed this message at this point, so relying on the pool variable is a bad idea
+ */
+ p1 = cur->prev->data;
+ p2 = part;
+ if (p1->diff_str != NULL && p2->diff_str != NULL) {
+ diff = compare_diff_distance (p1->diff_str, p2->diff_str);
+ }
+ else {
+ diff = fuzzy_compare_parts (p1, p2);
+ }
+ if (diff >= COMMON_PART_FACTOR) {
msg_info ("message <%s> has two common text parts, ignore the last one", task->message_id);
break;
}