aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2016-04-27 16:05:15 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2016-04-27 16:05:15 +0100
commit: 1ed9f282a568ef64372f687dba5ca25033b0ce2b (patch)
tree: af29263bf46da3c936cb1c9c0aa4743f9e736438 /src
parent: 781af3209968da2b8d7a4b1e38d4a1473c8a9852 (diff)
download: rspamd-1ed9f282a568ef64372f687dba5ca25033b0ce2b.tar.gz
download: rspamd-1ed9f282a568ef64372f687dba5ca25033b0ce2b.zip
[Fix] Fix and rescore R_PARTS_DIFFER logic
Signed-off-by: Vsevolod Stakhov <vsevolod@highsecure.ru>
Diffstat (limited to 'src')
-rw-r--r-- src/libmime/message.c | 23
-rw-r--r-- src/libmime/mime_expressions.c | 13
-rw-r--r-- src/libstat/stat_process.c | 6
3 files changed, 26 insertions, 16 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 2f3656eb8..14dd24a7f 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -1574,8 +1574,9 @@ rspamd_message_parse (struct rspamd_task *task)
const gchar *p;
gsize len;
goffset hdr_pos;
- gint diff, *pdiff, i;
- guint tw, dw;
+ gint i;
+ gdouble diff, *pdiff;
+ guint tw, *ptw, dw;
if (RSPAMD_TASK_IS_EMPTY (task)) {
/* Don't do anything with empty task */
@@ -1847,26 +1848,34 @@ rspamd_message_parse (struct rspamd_task *task)
if (!IS_PART_EMPTY (p1) && !IS_PART_EMPTY (p2) &&
p1->normalized_words && p2->normalized_words) {
- tw = MAX (p1->normalized_words->len, p2->normalized_words->len);
+ tw = p1->normalized_words->len + p2->normalized_words->len;
if (tw > 0) {
dw = rspamd_words_levenshtein_distance (task,
p1->normalized_words,
p2->normalized_words);
- diff = (100.0 * (gdouble)(tw - dw) / (gdouble)tw);
+ diff = (2.0 * (gdouble)dw) / (gdouble)tw;
- debug_task (
+ msg_err_task (
"different words: %d, total words: %d, "
- "got likeliness between parts of %d%%",
+ "got diff between parts of %.2f",
dw, tw,
diff);
- pdiff = rspamd_mempool_alloc (task->task_pool, sizeof (gint));
+ pdiff = rspamd_mempool_alloc (task->task_pool,
+ sizeof (gdouble));
*pdiff = diff;
rspamd_mempool_set_variable (task->task_pool,
"parts_distance",
pdiff,
NULL);
+ ptw = rspamd_mempool_alloc (task->task_pool,
+ sizeof (gint));
+ *ptw = tw;
+ rspamd_mempool_set_variable (task->task_pool,
+ "total_words",
+ ptw,
+ NULL);
}
}
}
diff --git a/src/libmime/mime_expressions.c b/src/libmime/mime_expressions.c
index fcea95bb2..c73b1c17c 100644
--- a/src/libmime/mime_expressions.c
+++ b/src/libmime/mime_expressions.c
@@ -956,9 +956,9 @@ rspamd_header_exists (struct rspamd_task * task, GArray * args, void *unused)
gboolean
rspamd_parts_distance (struct rspamd_task * task, GArray * args, void *unused)
{
- gint threshold, threshold2 = -1, diff;
+ gint threshold, threshold2 = -1;
struct expression_argument *arg;
- gint *pdiff;
+ gdouble *pdiff, diff;
if (args == NULL || args->len == 0) {
debug_task ("no threshold is specified, assume it 100");
@@ -998,12 +998,13 @@ rspamd_parts_distance (struct rspamd_task * task, GArray * args, void *unused)
if ((pdiff =
rspamd_mempool_get_variable (task->task_pool,
"parts_distance")) != NULL) {
- diff = *pdiff;
+ diff = (1.0 - (*pdiff)) * 100.0;
+
if (diff != -1) {
if (threshold2 > 0) {
- if (diff >=
- MIN (threshold,
- threshold2) && diff < MAX (threshold, threshold2)) {
+ if (diff >= MIN (threshold, threshold2) &&
+ diff < MAX (threshold, threshold2)) {
+
return TRUE;
}
}
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index e6d34e406..486d82c08 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -28,7 +28,7 @@
#define RSPAMD_LEARN_OP 1
#define RSPAMD_UNLEARN_OP 2
-static const gint similarity_treshold = 80;
+static const gdouble similarity_treshold = 80.0;
static void
rspamd_stat_tokenize_header (struct rspamd_task *task,
@@ -173,7 +173,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
GArray *words;
gchar *sub;
guint i, reserved_len = 0;
- gint *pdiff;
+ gdouble *pdiff;
for (i = 0; i < task->text_parts->len; i++) {
part = g_ptr_array_index (task->text_parts, i);
@@ -200,7 +200,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
}
- if (pdiff != NULL && *pdiff > similarity_treshold) {
+ if (pdiff != NULL && (1.0 - *pdiff) * 100.0 > similarity_treshold) {
msg_debug_task ("message has two common parts (%d%%), so skip the last one",
*pdiff);
break;