diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-07-29 12:59:46 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-07-29 12:59:46 +0100 |
commit | dd4daf7946332b996f4b49880107f63db908f4d4 (patch) | |
tree | c1618580e2ccb888f08eaf1ed0ef882f538b6bc2 /src | |
parent | c8dc14acc079a189f41eb2a30d0e4443c5abcce4 (diff) | |
download | rspamd-dd4daf7946332b996f4b49880107f63db908f4d4.tar.gz rspamd-dd4daf7946332b996f4b49880107f63db908f4d4.zip |
Skip same text parts when processing statistics.
Diffstat (limited to 'src')
-rw-r--r-- | src/libstat/stat_process.c | 10 |
1 file changed, 9 insertions, 1 deletion
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index f216d964b..93f48b3e9 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -35,6 +35,8 @@
 #define RSPAMD_LEARN_OP 1
 #define RSPAMD_UNLEARN_OP 2
 
+static const gint similarity_treshold = 80;
+
 struct preprocess_cb_data {
 	struct rspamd_task *task;
 	GList *classifier_runtimes;
@@ -187,9 +189,11 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
 	GArray *words;
 	gchar *sub;
 	guint i;
+	gint *pdiff;
 	gboolean compat;
 
 	compat = tok->tokenizer->is_compat (tok);
+	pdiff = rspamd_mempool_get_variable (task->task_pool, "parts_distance");
 
 	for (i = 0; i < task->text_parts->len; i ++) {
 		part = g_ptr_array_index (task->text_parts, i);
@@ -205,7 +209,11 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
 			}
 		}
 
-		/* TODO: compare parts distance */
+		if (pdiff != NULL && *pdiff > similarity_treshold) {
+			msg_debug ("message has two common parts (%d%%), so skip the last one",
+				*pdiff);
+			break;
+		}
 	}
 
 	if (task->subject != NULL) {