Signed-off-by: Vsevolod Stakhov <vsevolod@highsecure.ru>
tags/1.3.0
@@ -33,15 +33,24 @@ reconf['R_FLASH_REDIR_IMGSHACK'] = '/^(?:http:\\/\\/)?img\\d{1,5}\\.imageshack\\ | |||
-- Different text parts | |||
rspamd_config.R_PARTS_DIFFER = function(task) | |||
-- 'parts_distance' is read from the task mempool; the new code reads it as a double | |||
local distance = task:get_mempool():get_variable('parts_distance', 'int') | |||
local distance = task:get_mempool():get_variable('parts_distance', 'double') | |||
if distance then | |||
local nd = tonumber(distance) | |||
if nd < 50 then | |||
local score = 1 - util.tanh(nd / 100.0) | |||
task:insert_result('R_PARTS_DIFFER', score, tostring(nd) .. '%') | |||
-- ND is relation of different words to total words | |||
if nd >= 0.5 then | |||
local tw = task:get_mempool():get_variable('total_words', 'int') | |||
if tw then | |||
-- Declare score once here: a 'local' made inside a branch is not visible at insert_result | |||
local score | |||
if tw > 30 then | |||
-- We are confident about difference | |||
score = (nd - 0.5) * 2.0 | |||
else | |||
-- We are not so confident about difference | |||
score = (nd - 0.5) | |||
end | |||
task:insert_result('R_PARTS_DIFFER', score, tostring(100.0 * nd) .. '%') | |||
end | |||
end | |||
end | |||
@@ -1574,8 +1574,9 @@ rspamd_message_parse (struct rspamd_task *task) | |||
const gchar *p; | |||
gsize len; | |||
goffset hdr_pos; | |||
gint diff, *pdiff, i; | |||
guint tw, dw; | |||
gint i; | |||
gdouble diff, *pdiff; | |||
guint tw, *ptw, dw; | |||
if (RSPAMD_TASK_IS_EMPTY (task)) { | |||
/* Don't do anything with empty task */ | |||
@@ -1847,26 +1848,34 @@ rspamd_message_parse (struct rspamd_task *task) | |||
if (!IS_PART_EMPTY (p1) && !IS_PART_EMPTY (p2) && | |||
p1->normalized_words && p2->normalized_words) { | |||
tw = MAX (p1->normalized_words->len, p2->normalized_words->len); | |||
/* tw is the combined word count of both parts */ | |||
tw = p1->normalized_words->len + p2->normalized_words->len; | |||
if (tw > 0) { | |||
dw = rspamd_words_levenshtein_distance (task, | |||
p1->normalized_words, | |||
p2->normalized_words); | |||
diff = (100.0 * (gdouble)(tw - dw) / (gdouble)tw); | |||
/* diff is the ratio 2 * dw / tw, stored as a double */ | |||
diff = (2.0 * (gdouble)dw) / (gdouble)tw; | |||
/* Diagnostic output only: keep it at debug level rather than error level */ | |||
debug_task ( | |||
"different words: %d, total words: %d, " | |||
"got likeliness between parts of %d%%", | |||
"got diff between parts of %.2f", | |||
dw, tw, | |||
diff); | |||
pdiff = rspamd_mempool_alloc (task->task_pool, sizeof (gint)); | |||
pdiff = rspamd_mempool_alloc (task->task_pool, | |||
sizeof (gdouble)); | |||
*pdiff = diff; | |||
rspamd_mempool_set_variable (task->task_pool, | |||
"parts_distance", | |||
pdiff, | |||
NULL); | |||
/* Publish the total word count alongside the distance; size the slot from the pointee type */ | |||
ptw = rspamd_mempool_alloc (task->task_pool, | |||
sizeof (*ptw)); | |||
*ptw = tw; | |||
rspamd_mempool_set_variable (task->task_pool, | |||
"total_words", | |||
ptw, | |||
NULL); | |||
} | |||
} | |||
} |
@@ -956,9 +956,9 @@ rspamd_header_exists (struct rspamd_task * task, GArray * args, void *unused) | |||
gboolean | |||
rspamd_parts_distance (struct rspamd_task * task, GArray * args, void *unused) | |||
{ | |||
gint threshold, threshold2 = -1, diff; | |||
gint threshold, threshold2 = -1; | |||
struct expression_argument *arg; | |||
gint *pdiff; | |||
gdouble *pdiff, diff; | |||
if (args == NULL || args->len == 0) { | |||
debug_task ("no threshold is specified, assume it 100"); | |||
@@ -998,12 +998,13 @@ rspamd_parts_distance (struct rspamd_task * task, GArray * args, void *unused) | |||
if ((pdiff = | |||
rspamd_mempool_get_variable (task->task_pool, | |||
"parts_distance")) != NULL) { | |||
diff = *pdiff; | |||
diff = (1.0 - (*pdiff)) * 100.0; | |||
if (diff != -1) { | |||
if (threshold2 > 0) { | |||
if (diff >= | |||
MIN (threshold, | |||
threshold2) && diff < MAX (threshold, threshold2)) { | |||
if (diff >= MIN (threshold, threshold2) && | |||
diff < MAX (threshold, threshold2)) { | |||
return TRUE; | |||
} | |||
} |
@@ -28,7 +28,7 @@ | |||
#define RSPAMD_LEARN_OP 1 | |||
#define RSPAMD_UNLEARN_OP 2 | |||
static const gint similarity_treshold = 80; | |||
static const gdouble similarity_treshold = 80.0; | |||
static void | |||
rspamd_stat_tokenize_header (struct rspamd_task *task, | |||
@@ -173,7 +173,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, | |||
GArray *words; | |||
gchar *sub; | |||
guint i, reserved_len = 0; | |||
gint *pdiff; | |||
gdouble *pdiff; | |||
for (i = 0; i < task->text_parts->len; i++) { | |||
part = g_ptr_array_index (task->text_parts, i); | |||
@@ -200,7 +200,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, | |||
} | |||
if (pdiff != NULL && *pdiff > similarity_treshold) { | |||
/* *pdiff is a gdouble; convert it to a similarity percentage before comparing */ | |||
if (pdiff != NULL && (1.0 - *pdiff) * 100.0 > similarity_treshold) { | |||
/* Log the same percentage that was compared, using a floating point conversion */ | |||
msg_debug_task ("message has two common parts (%.2f%%), so skip the last one", | |||
(1.0 - *pdiff) * 100.0); | |||
break; |