Przeglądaj źródła

[Fix] Fix and rescore R_PARTS_DIFFER logic

Signed-off-by: Vsevolod Stakhov <vsevolod@highsecure.ru>
tags/1.3.0
Vsevolod Stakhov 8 lat temu
rodzic
commit
1ed9f282a5
No account linked to committer's email address

+ 15
- 6
rules/misc.lua Wyświetl plik

@@ -33,15 +33,24 @@ reconf['R_FLASH_REDIR_IMGSHACK'] = '/^(?:http:\\/\\/)?img\\d{1,5}\\.imageshack\\

-- Different text parts
rspamd_config.R_PARTS_DIFFER = function(task)
local distance = task:get_mempool():get_variable('parts_distance', 'int')
local distance = task:get_mempool():get_variable('parts_distance', 'double')

if distance then
local nd = tonumber(distance)

if nd < 50 then
local score = 1 - util.tanh(nd / 100.0)

task:insert_result('R_PARTS_DIFFER', score, tostring(nd) .. '%')
-- ND is the ratio of different words to total words
if nd >= 0.5 then
local tw = task:get_mempool():get_variable('total_words', 'int')

if tw then
if tw > 30 then
-- We are confident about difference
local score = (nd - 0.5) * 2.0
else
-- We are not so confident about difference
local score = (nd - 0.5)
end
task:insert_result('R_PARTS_DIFFER', score, tostring(100.0 * nd) .. '%')
end
end
end


+ 16
- 7
src/libmime/message.c Wyświetl plik

@@ -1574,8 +1574,9 @@ rspamd_message_parse (struct rspamd_task *task)
const gchar *p;
gsize len;
goffset hdr_pos;
gint diff, *pdiff, i;
guint tw, dw;
gint i;
gdouble diff, *pdiff;
guint tw, *ptw, dw;

if (RSPAMD_TASK_IS_EMPTY (task)) {
/* Don't do anything with empty task */
@@ -1847,26 +1848,34 @@ rspamd_message_parse (struct rspamd_task *task)
if (!IS_PART_EMPTY (p1) && !IS_PART_EMPTY (p2) &&
p1->normalized_words && p2->normalized_words) {

tw = MAX (p1->normalized_words->len, p2->normalized_words->len);
tw = p1->normalized_words->len + p2->normalized_words->len;

if (tw > 0) {
dw = rspamd_words_levenshtein_distance (task,
p1->normalized_words,
p2->normalized_words);
diff = (100.0 * (gdouble)(tw - dw) / (gdouble)tw);
diff = (2.0 * (gdouble)dw) / (gdouble)tw;

debug_task (
msg_err_task (
"different words: %d, total words: %d, "
"got likeliness between parts of %d%%",
"got diff between parts of %.2f",
dw, tw,
diff);

pdiff = rspamd_mempool_alloc (task->task_pool, sizeof (gint));
pdiff = rspamd_mempool_alloc (task->task_pool,
sizeof (gdouble));
*pdiff = diff;
rspamd_mempool_set_variable (task->task_pool,
"parts_distance",
pdiff,
NULL);
ptw = rspamd_mempool_alloc (task->task_pool,
sizeof (gint));
*ptw = tw;
rspamd_mempool_set_variable (task->task_pool,
"total_words",
ptw,
NULL);
}
}
}

+ 7
- 6
src/libmime/mime_expressions.c Wyświetl plik

@@ -956,9 +956,9 @@ rspamd_header_exists (struct rspamd_task * task, GArray * args, void *unused)
gboolean
rspamd_parts_distance (struct rspamd_task * task, GArray * args, void *unused)
{
gint threshold, threshold2 = -1, diff;
gint threshold, threshold2 = -1;
struct expression_argument *arg;
gint *pdiff;
gdouble *pdiff, diff;

if (args == NULL || args->len == 0) {
debug_task ("no threshold is specified, assume it 100");
@@ -998,12 +998,13 @@ rspamd_parts_distance (struct rspamd_task * task, GArray * args, void *unused)
if ((pdiff =
rspamd_mempool_get_variable (task->task_pool,
"parts_distance")) != NULL) {
diff = *pdiff;
diff = (1.0 - (*pdiff)) * 100.0;

if (diff != -1) {
if (threshold2 > 0) {
if (diff >=
MIN (threshold,
threshold2) && diff < MAX (threshold, threshold2)) {
if (diff >= MIN (threshold, threshold2) &&
diff < MAX (threshold, threshold2)) {
return TRUE;
}
}

+ 3
- 3
src/libstat/stat_process.c Wyświetl plik

@@ -28,7 +28,7 @@
#define RSPAMD_LEARN_OP 1
#define RSPAMD_UNLEARN_OP 2

static const gint similarity_treshold = 80;
static const gdouble similarity_treshold = 80.0;

static void
rspamd_stat_tokenize_header (struct rspamd_task *task,
@@ -173,7 +173,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
GArray *words;
gchar *sub;
guint i, reserved_len = 0;
gint *pdiff;
gdouble *pdiff;

for (i = 0; i < task->text_parts->len; i++) {
part = g_ptr_array_index (task->text_parts, i);
@@ -200,7 +200,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
}


if (pdiff != NULL && *pdiff > similarity_treshold) {
if (pdiff != NULL && (1.0 - *pdiff) * 100.0 > similarity_treshold) {
msg_debug_task ("message has two common parts (%d%%), so skip the last one",
*pdiff);
break;

Ładowanie…
Anuluj
Zapisz