summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-07-29 12:59:46 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-07-29 12:59:46 +0100
commit dd4daf7946332b996f4b49880107f63db908f4d4 (patch)
tree c1618580e2ccb888f08eaf1ed0ef882f538b6bc2 /src
parent c8dc14acc079a189f41eb2a30d0e4443c5abcce4 (diff)
download rspamd-dd4daf7946332b996f4b49880107f63db908f4d4.tar.gz
rspamd-dd4daf7946332b996f4b49880107f63db908f4d4.zip
Skip same text parts when processing statistics.
Diffstat (limited to 'src')
-rw-r--r-- src/libstat/stat_process.c 10
1 files changed, 9 insertions, 1 deletions
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index f216d964b..93f48b3e9 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -35,6 +35,8 @@
#define RSPAMD_LEARN_OP 1
#define RSPAMD_UNLEARN_OP 2
+static const gint similarity_treshold = 80;
+
struct preprocess_cb_data {
struct rspamd_task *task;
GList *classifier_runtimes;
@@ -187,9 +189,11 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
GArray *words;
gchar *sub;
guint i;
+ gint *pdiff;
gboolean compat;
compat = tok->tokenizer->is_compat (tok);
+ pdiff = rspamd_mempool_get_variable (task->task_pool, "parts_distance");
for (i = 0; i < task->text_parts->len; i ++) {
part = g_ptr_array_index (task->text_parts, i);
@@ -205,7 +209,11 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
}
}
- /* TODO: compare parts distance */
+ if (pdiff != NULL && *pdiff > similarity_treshold) {
+ msg_debug ("message has two common parts (%d%%), so skip the last one",
+ *pdiff);
+ break;
+ }
}
if (task->subject != NULL) {