From f579fed1dee306a5cf58360589d29ae1514d25b4 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 30 Mar 2009 19:35:57 +0400 Subject: [PATCH] * Add 3 functions: - is_recipients_sorted - has_only_html_part - compare_recipients_distance * Update documentation * Fix build --- README.utf8.txt | 7 +++ src/cfg_file.l | 5 +- src/cfg_file.y | 6 +- src/expressions.c | 144 ++++++++++++++++++++++++++++++++++++++++++++++ src/fuzzy.c | 2 +- src/fuzzy.h | 2 + src/main.h | 1 + src/message.c | 5 ++ 8 files changed, 167 insertions(+), 5 deletions(-) diff --git a/README.utf8.txt b/README.utf8.txt index 3ea18664a..d328efa7e 100644 --- a/README.utf8.txt +++ b/README.utf8.txt @@ -147,3 +147,10 @@ SOME_SYMBOL = "${to_blah} & !(${from_blah} | ${subject_blah})" * content_type_is_type - сравнивает тип content-type с регулярным выражением или строкой content_type_is_type(text) content_type_is_subtype(/?.html/) + * regexp_match_number - принимает в качестве первого параметра число, которое означает порог сработавших регэкспов и + список регэкспов или функций, которые должны проверяться. Если число сработавших регэкспов или функций больше порога, + функция возвращает TRUE, иначе - FALSE, например: + regexp_match_number(2, ${__RE1}, ${__RE2}, header_exists(Subject)) + * has_only_html_part - функция возвращает TRUE, если в сообщении есть только одна HTML часть + * compare_recipients_distance - вычисляет процент схожих получателей письма. Принимает аргумент - порог в процентах похожести. + * is_recipients_sorted - возвращает TRUE, если список получателей сортирован (работает только если число получателей >= 5). 
diff --git a/src/cfg_file.l b/src/cfg_file.l index 3224b4fde..8d347409f 100644 --- a/src/cfg_file.l +++ b/src/cfg_file.l @@ -11,7 +11,7 @@ #ifdef WITH_LUA #include "lua.h" #else -#define add_luabuf(x) yyerror ("lua support diabled"); YYERROR +#define add_luabuf(x) yyerror ("lua support diabled") #endif #define MAX_INCLUDE_DEPTH 10 @@ -157,6 +157,9 @@ yes|YES|no|NO|[yY]|[nN] yylval.flag=parse_flag(yytext); return FLAG; \$[a-zA-Z_][a-zA-Z0-9_]+ yylval.string=strdup(yytext + 1); return VARIABLE; \".+[^\\]\" yylval.string=strdup(yytext + 1); yylval.string[strlen(yylval.string) - 1] = '\0'; unescape_quotes(yylval.string); return QUOTEDSTRING; +\n /* ignore EOL */; +[ \t]+ /* ignore whitespace */; +[ \t]*#.* /* ignore comments */; ^.endlua$ BEGIN(INITIAL); .* add_luabuf(yytext); diff --git a/src/cfg_file.y b/src/cfg_file.y index b62b6b5b4..92b695d55 100644 --- a/src/cfg_file.y +++ b/src/cfg_file.y @@ -14,8 +14,6 @@ #include "perl.h" #endif -#define YYDEBUG 1 - extern struct config_file *cfg; extern int yylineno; extern char *yytext; @@ -330,8 +328,10 @@ metricfunction: cur_metric->func_name = memory_pool_strdup (cfg->cfg_pool, $3); #ifdef WITH_LUA cur_metric->func = lua_consolidation_func; -#else +#elif !defined(WITHOUT_PERL) cur_metric->func = perl_consolidation_func; +#else + yyerror ("yyparse: rspamd is not compiled with perl or lua, so it is not possible to use custom consolidation functions"); #endif } ; diff --git a/src/expressions.c b/src/expressions.c index 957c43c5c..3fff36fbd 100644 --- a/src/expressions.c +++ b/src/expressions.c @@ -37,6 +37,9 @@ gboolean rspamd_content_type_has_param (struct worker_task *task, GList *args); gboolean rspamd_content_type_is_subtype (struct worker_task *task, GList *args); gboolean rspamd_content_type_is_type (struct worker_task *task, GList *args); gboolean rspamd_parts_distance (struct worker_task *task, GList *args); +gboolean rspamd_recipients_distance (struct worker_task *task, GList *args); +gboolean 
rspamd_has_only_html_part (struct worker_task *task, GList *args);
+gboolean rspamd_is_recipients_sorted (struct worker_task *task, GList *args);
 
 /*
  * List of internal functions of rspamd
@@ -48,11 +51,14 @@ static struct _fl {
 } rspamd_functions_list[] = {
 	{ "compare_encoding", rspamd_compare_encoding },
 	{ "compare_parts_distance", rspamd_parts_distance },
+	{ "compare_recipients_distance", rspamd_recipients_distance },
 	{ "content_type_compare_param", rspamd_content_type_compare_param },
 	{ "content_type_has_param", rspamd_content_type_has_param },
 	{ "content_type_is_subtype", rspamd_content_type_is_subtype },
 	{ "content_type_is_type", rspamd_content_type_is_type },
+	{ "has_only_html_part", rspamd_has_only_html_part },
 	{ "header_exists", rspamd_header_exists },
+	{ "is_recipients_sorted", rspamd_is_recipients_sorted },
 };
 
 static struct _fl *list_ptr = &rspamd_functions_list[0];
@@ -916,6 +922,190 @@ rspamd_content_type_is_type (struct worker_task *task, GList *args)
 	return FALSE;
 }
 
+/* Display name and address of one recipient; either pointer may be NULL */
+struct addr_list {
+	const char *name;
+	const char *addr;
+};
+
+/* Number of leading display-name characters compared between recipients */
+#define COMPARE_RCPT_LEN 3
+/* Recipient lists shorter than this are never checked */
+#define MIN_RCPT_TO_COMPARE 5
+
+/*
+ * Check whether recipients of the message look alike.
+ * Expects one argument: threshold as percent of similar recipient pairs.
+ * Two recipients are similar when the first COMPARE_RCPT_LEN characters of
+ * their display names match or their addresses are equal (case-insensitive).
+ * Returns TRUE when the percent of similar pairs is >= threshold; FALSE
+ * otherwise, also when the argument is missing or is not a number, or when
+ * there are fewer than MIN_RCPT_TO_COMPARE recipients.
+ */
+gboolean
+rspamd_recipients_distance (struct worker_task *task, GList *args)
+{
+	struct expression_argument *arg;
+	InternetAddressList *cur;
+	InternetAddress *addr;
+	double threshold;
+	struct addr_list *ar;
+	char *end = NULL;
+	int num, i, j, hits = 0, total = 0;
+
+	if (args == NULL) {
+		msg_warn ("rspamd_recipients_distance: no parameters to function");
+		return FALSE;
+	}
+
+	arg = args->data;
+	if (arg == NULL || arg->data == NULL) {
+		msg_warn ("rspamd_recipients_distance: empty threshold argument");
+		return FALSE;
+	}
+	threshold = strtod ((char *)arg->data, &end);
+	if (end == (char *)arg->data) {
+		msg_warn ("rspamd_recipients_distance: threshold is not a number");
+		return FALSE;
+	}
+
+	num = internet_address_list_length (task->rcpts);
+	if (num < MIN_RCPT_TO_COMPARE) {
+		return FALSE;
+	}
+	ar = memory_pool_alloc (task->task_pool, num * sizeof (struct addr_list));
+
+	/* Fill array: one slot per recipient, never more than num slots */
+	cur = task->rcpts;
+	i = 0;
+	while (cur != NULL && i < num) {
+		addr = internet_address_list_get_address (cur);
+		ar[i].name = internet_address_get_name (addr);
+		ar[i].addr = internet_address_get_addr (addr);
+		cur = internet_address_list_next (cur);
+		i ++;
+	}
+	/* Compare only the slots that were actually filled */
+	num = i;
+
+	/* Cycle all pairs of elements in array; a pair scores at most one hit */
+	for (i = 0; i < num; i ++) {
+		for (j = i + 1; j < num; j ++) {
+			if (ar[i].name != NULL && ar[j].name != NULL &&
+				g_ascii_strncasecmp (ar[i].name, ar[j].name, COMPARE_RCPT_LEN) == 0) {
+				hits ++;
+			}
+			else if (ar[i].addr != NULL && ar[j].addr != NULL &&
+				g_ascii_strcasecmp (ar[i].addr, ar[j].addr) == 0) {
+				hits ++;
+			}
+			total ++;
+		}
+	}
+
+	/* No pairs were compared, so nothing can be said about similarity */
+	if (total == 0) {
+		return FALSE;
+	}
+
+	/* Percent of similar pairs among all compared pairs */
+	if ((double)hits * 100. / (double)total >= threshold) {
+		return TRUE;
+	}
+
+	return FALSE;
+}
+
+/*
+ * Return TRUE when the message has at least one text part and every text
+ * part is HTML; args are not used.
+ */
+gboolean
+rspamd_has_only_html_part (struct worker_task *task, GList *args)
+{
+	struct mime_text_part *p;
+	GList *cur;
+	gboolean res = FALSE;
+
+	cur = g_list_first (task->text_parts);
+	while (cur) {
+		p = cur->data;
+		if (p->is_html) {
+			res = TRUE;
+		}
+		else {
+			res = FALSE;
+			break;
+		}
+		cur = g_list_next (cur);
+	}
+
+	return res;
+}
+
+/*
+ * Return TRUE when the list has at least MIN_RCPT_TO_COMPARE entries and
+ * neither display names nor addresses go backwards (case-insensitive) from
+ * one entry to the next. A missing name or address is not compared.
+ */
+static gboolean
+is_recipient_list_sorted (const InternetAddressList *ia)
+{
+	const InternetAddressList *cur;
+	InternetAddress *addr;
+	gboolean res = TRUE;
+	struct addr_list current = {NULL, NULL}, previous = {NULL, NULL};
+
+	/* Do not check missing or too short address lists */
+	if (ia == NULL || internet_address_list_length (ia) < MIN_RCPT_TO_COMPARE) {
+		return FALSE;
+	}
+
+	cur = ia;
+	while (cur) {
+		addr = internet_address_list_get_address (cur);
+		current.name = internet_address_get_name (addr);
+		current.addr = internet_address_get_addr (addr);
+		/* Names and addresses are checked independently of each other */
+		if (current.name != NULL && previous.name != NULL &&
+			g_ascii_strcasecmp (current.name, previous.name) < 0) {
+			res = FALSE;
+			break;
+		}
+		if (current.addr != NULL && previous.addr != NULL &&
+			g_ascii_strcasecmp (current.addr, previous.addr) < 0) {
+			res = FALSE;
+			break;
+		}
+		previous.name = current.name;
+		previous.addr = current.addr;
+		cur = internet_address_list_next (cur);
+	}
+
+	return res;
+}
+
+/*
+ * Return TRUE when the To, Bcc or Cc recipient list of the message passes
+ * is_recipient_list_sorted; args are not used.
+ */
+gboolean
+rspamd_is_recipients_sorted (struct worker_task *task, GList *args)
+{
+	/* Check all types of addresses */
+	if (is_recipient_list_sorted (g_mime_message_get_recipients (task->message, GMIME_RECIPIENT_TYPE_TO))) {
+		return TRUE;
+	}
+	if (is_recipient_list_sorted (g_mime_message_get_recipients (task->message, GMIME_RECIPIENT_TYPE_BCC))) {
+		return TRUE;
+	}
+	if (is_recipient_list_sorted (g_mime_message_get_recipients (task->message, GMIME_RECIPIENT_TYPE_CC))) {
+		return TRUE;
+	}
+
+	return FALSE;
+}
+
 /*
  * vi:ts=4
  */
diff --git a/src/fuzzy.c b/src/fuzzy.c
index ecd1af64f..a53ed929d 100644
--- a/src/fuzzy.c
+++ b/src/fuzzy.c
@@ -107,7 +107,7 @@ fuzzy_update (fuzzy_hash_t *h, char c)
  *
  * Replace cost is normally 1, and 2 with nonzero xcost.
  */
-static uint32_t
+uint32_t
 lev_distance (char *s1, int len1, char *s2, int len2)
 {
 	int i;
diff --git a/src/fuzzy.h b/src/fuzzy.h
index 50d1a9110..4895e6c53 100644
--- a/src/fuzzy.h
+++ b/src/fuzzy.h
@@ -37,5 +37,7 @@ fuzzy_hash_t * fuzzy_init_byte_array (GByteArray *in, memory_pool_t *pool);
  */
 int fuzzy_compare_hashes (fuzzy_hash_t *h1, fuzzy_hash_t *h2);
 
+uint32_t lev_distance (char *s1, int len1, char *s2, int len2);
+
 #endif
 
diff --git a/src/main.h b/src/main.h
index 431ed8f4b..ab785fa5a 100644
--- a/src/main.h
+++ b/src/main.h
@@ -173,6 +173,7 @@ struct worker_task {
 	memcached_ctx_t *memc_ctx;									/**< memcached context associated with task */
 	int parts_count;											/**< mime parts count */
 	GMimeMessage *message;										/**< message, parsed with GMime */
+	InternetAddressList *rcpts;									/**< list of all recipients */
 	GList *parts;												/**< list of parsed parts */
 	GList *text_parts;											/**< list of text parts */
 	char *raw_headers;											/**< list of raw headers */
diff --git a/src/message.c b/src/message.c
index c35659366..156bd1829 100644
--- a/src/message.c
+++ b/src/message.c
@@ -406,6 +406,11 @@ process_message (struct worker_task *task)
 		memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_free, task->raw_headers);
 	}
 
+	task->rcpts = g_mime_message_get_all_recipients (message);
+	if (task->rcpts) {
+		memory_pool_add_destructor (task->task_pool, (pool_destruct_func)internet_address_list_destroy, task->rcpts);
+	}
+
 	task->worker->srv->stat->messages_scanned ++;
 
 	/* free the parser (and the stream) */
-- 
2.39.5