aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2009-03-30 19:35:57 +0400
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2009-03-30 19:35:57 +0400
commitf579fed1dee306a5cf58360589d29ae1514d25b4 (patch)
treef5d5e288553d30fc617d06bf9e48ae919092f876
parentdd2fbb7a5b7e1c0d844900147486ad7ffd98c1e4 (diff)
downloadrspamd-f579fed1dee306a5cf58360589d29ae1514d25b4.tar.gz
rspamd-f579fed1dee306a5cf58360589d29ae1514d25b4.zip
* Add 3 functions:
  - is_recipients_sorted
  - has_only_html_part
  - compare_recipients_distance
* Update documentation
* Fix build
-rw-r--r--README.utf8.txt7
-rw-r--r--src/cfg_file.l5
-rw-r--r--src/cfg_file.y6
-rw-r--r--src/expressions.c144
-rw-r--r--src/fuzzy.c2
-rw-r--r--src/fuzzy.h2
-rw-r--r--src/main.h1
-rw-r--r--src/message.c5
8 files changed, 167 insertions, 5 deletions
diff --git a/README.utf8.txt b/README.utf8.txt
index 3ea18664a..d328efa7e 100644
--- a/README.utf8.txt
+++ b/README.utf8.txt
@@ -147,3 +147,10 @@ SOME_SYMBOL = "${to_blah} & !(${from_blah} | ${subject_blah})"
* content_type_is_type - сравнивает тип content-type с регулярным выражением или строкой
content_type_is_type(text)
content_type_is_subtype(/?.html/)
+ * regexp_match_number - принимает в качестве первого параметра число, которое означает порог сработавших регэкспов и
+ список регэкспов или функций, которые должны проверяться. Если число сработавших регэкспов или функций больше порога,
+ функция возвращает TRUE, иначе - FALSE, например:
+ regexp_match_number(2, ${__RE1}, ${__RE2}, header_exists(Subject))
+ * has_only_html_part - функция возвращает TRUE, если все текстовые части сообщения являются HTML (то есть в сообщении есть только HTML часть и нет части с простым текстом)
+ * compare_recipients_distance - вычисляет процент схожих получателей письма. Принимает аргумент - порог в процентах похожести.
+ * is_recipients_sorted - возвращает TRUE, если список получателей отсортирован (работает, только если число получателей >= 5).
diff --git a/src/cfg_file.l b/src/cfg_file.l
index 3224b4fde..8d347409f 100644
--- a/src/cfg_file.l
+++ b/src/cfg_file.l
@@ -11,7 +11,7 @@
#ifdef WITH_LUA
#include "lua.h"
#else
-#define add_luabuf(x) yyerror ("lua support diabled"); YYERROR
+/* Stub used when rspamd is built without Lua: ignores its argument and only reports an error */
+#define add_luabuf(x) yyerror ("lua support disabled")
#endif
#define MAX_INCLUDE_DEPTH 10
@@ -157,6 +157,9 @@ yes|YES|no|NO|[yY]|[nN] yylval.flag=parse_flag(yytext); return FLAG;
<module>\$[a-zA-Z_][a-zA-Z0-9_]+ yylval.string=strdup(yytext + 1); return VARIABLE;
<module>\".+[^\\]\" yylval.string=strdup(yytext + 1); yylval.string[strlen(yylval.string) - 1] = '\0'; unescape_quotes(yylval.string); return QUOTEDSTRING;
+<lua>\n /* ignore EOL */;
+<lua>[ \t]+ /* ignore whitespace */;
+<lua>[ \t]*#.* /* ignore comments */;
<lua>^.endlua$ BEGIN(INITIAL);
<lua>.* add_luabuf(yytext);
diff --git a/src/cfg_file.y b/src/cfg_file.y
index b62b6b5b4..92b695d55 100644
--- a/src/cfg_file.y
+++ b/src/cfg_file.y
@@ -14,8 +14,6 @@
#include "perl.h"
#endif
-#define YYDEBUG 1
-
extern struct config_file *cfg;
extern int yylineno;
extern char *yytext;
@@ -330,8 +328,10 @@ metricfunction:
cur_metric->func_name = memory_pool_strdup (cfg->cfg_pool, $3);
#ifdef WITH_LUA
cur_metric->func = lua_consolidation_func;
-#else
+#elif !defined(WITHOUT_PERL)
cur_metric->func = perl_consolidation_func;
+#else
+ yyerror ("yyparse: rspamd is not compiled with perl or lua, so it is not possible to use custom consolidation functions");
#endif
}
;
diff --git a/src/expressions.c b/src/expressions.c
index 957c43c5c..3fff36fbd 100644
--- a/src/expressions.c
+++ b/src/expressions.c
@@ -37,6 +37,9 @@ gboolean rspamd_content_type_has_param (struct worker_task *task, GList *args);
gboolean rspamd_content_type_is_subtype (struct worker_task *task, GList *args);
gboolean rspamd_content_type_is_type (struct worker_task *task, GList *args);
gboolean rspamd_parts_distance (struct worker_task *task, GList *args);
+gboolean rspamd_recipients_distance (struct worker_task *task, GList *args);
+gboolean rspamd_has_only_html_part (struct worker_task *task, GList *args);
+gboolean rspamd_is_recipients_sorted (struct worker_task *task, GList *args);
/*
* List of internal functions of rspamd
@@ -48,11 +51,14 @@ static struct _fl {
} rspamd_functions_list[] = {
{ "compare_encoding", rspamd_compare_encoding },
{ "compare_parts_distance", rspamd_parts_distance },
+ { "compare_recipients_distance", rspamd_recipients_distance },
{ "content_type_compare_param", rspamd_content_type_compare_param },
{ "content_type_has_param", rspamd_content_type_has_param },
{ "content_type_is_subtype", rspamd_content_type_is_subtype },
{ "content_type_is_type", rspamd_content_type_is_type },
+ { "has_only_html_part", rspamd_has_only_html_part },
{ "header_exists", rspamd_header_exists },
+ { "is_recipients_sorted", rspamd_is_recipients_sorted },
};
static struct _fl *list_ptr = &rspamd_functions_list[0];
@@ -916,6 +922,144 @@ rspamd_content_type_is_type (struct worker_task *task, GList *args)
return FALSE;
}
+/*
+ * Name/address pair describing one recipient; both fields are filled from
+ * internet_address_get_name () and internet_address_get_addr ().
+ */
+struct addr_list {
+ const char *name;
+ const char *addr;
+};
+
+/* Number of leading characters of recipient names that are compared */
+#define COMPARE_RCPT_LEN 3
+/* Recipient lists shorter than this are not examined at all */
+#define MIN_RCPT_TO_COMPARE 5
+
+/*
+ * Check how similar the recipients of a message are to each other.
+ *
+ * Every unordered pair of recipients from task->rcpts is examined. A pair is
+ * counted as a hit (at most once) when the first COMPARE_RCPT_LEN characters
+ * of both names match or when both addresses are equal; both comparisons are
+ * ASCII case-insensitive. Missing names or addresses never match.
+ *
+ * @param task task being processed
+ * @param args argument list; the first element holds the similarity
+ *             threshold in percent as a string
+ * @return TRUE if the percentage of similar pairs is >= threshold, FALSE
+ *         otherwise, and also when the argument is missing or not numeric or
+ *         when there are fewer than MIN_RCPT_TO_COMPARE recipients
+ */
+gboolean
+rspamd_recipients_distance (struct worker_task *task, GList *args)
+{
+	struct expression_argument *arg;
+	InternetAddressList *cur;
+	InternetAddress *addr;
+	double threshold;
+	char *end = NULL;
+	struct addr_list *ar;
+	int num, i, j, hits = 0, total = 0;
+
+	if (args == NULL) {
+		msg_warn ("rspamd_recipients_distance: no parameters to function");
+		return FALSE;
+	}
+
+	arg = args->data;
+	if (arg == NULL || arg->data == NULL) {
+		msg_warn ("rspamd_recipients_distance: empty threshold argument");
+		return FALSE;
+	}
+	threshold = strtod ((char *)arg->data, &end);
+	if (end == (char *)arg->data) {
+		/* Nothing was parsed: do not silently treat garbage as 0 */
+		msg_warn ("rspamd_recipients_distance: threshold is not a number");
+		return FALSE;
+	}
+
+	num = internet_address_list_length (task->rcpts);
+	if (num < MIN_RCPT_TO_COMPARE) {
+		return FALSE;
+	}
+	ar = memory_pool_alloc (task->task_pool, num * sizeof (struct addr_list));
+
+	/* Fill array, never writing past the allocated number of slots */
+	cur = task->rcpts;
+	i = 0;
+	while (cur != NULL && i < num) {
+		addr = internet_address_list_get_address (cur);
+		if (addr != NULL) {
+			ar[i].name = internet_address_get_name (addr);
+			ar[i].addr = internet_address_get_addr (addr);
+		}
+		else {
+			ar[i].name = NULL;
+			ar[i].addr = NULL;
+		}
+		i ++;
+		cur = internet_address_list_next (cur);
+	}
+	/* Compare only the slots that were really initialized */
+	num = i;
+
+	/* Cycle all pairs of elements in array */
+	for (i = 0; i < num; i ++) {
+		for (j = i + 1; j < num; j ++) {
+			if (ar[i].name != NULL && ar[j].name != NULL &&
+				g_ascii_strncasecmp (ar[i].name, ar[j].name, COMPARE_RCPT_LEN) == 0) {
+				/* Common beginning of the name */
+				hits ++;
+			}
+			else if (ar[i].addr != NULL && ar[j].addr != NULL &&
+				g_ascii_strcasecmp (ar[i].addr, ar[j].addr) == 0) {
+				/* Same address, but different or missing name */
+				hits ++;
+			}
+			total ++;
+		}
+	}
+
+	if (total == 0) {
+		return FALSE;
+	}
+
+	/* Percent of similar pairs among all pairs */
+	if ((double)hits * 100. / (double)total >= threshold) {
+		return TRUE;
+	}
+
+	return FALSE;
+}
+
+/*
+ * Tell whether the text parts of a message consist of HTML only.
+ *
+ * @param task task being processed; its text_parts list is walked
+ * @param args unused
+ * @return TRUE if there is at least one text part and every text part is
+ *         HTML, FALSE otherwise
+ */
+gboolean
+rspamd_has_only_html_part (struct worker_task *task, GList *args)
+{
+	struct mime_text_part *part;
+	GList *node;
+
+	node = g_list_first (task->text_parts);
+	if (node == NULL) {
+		/* No text parts at all */
+		return FALSE;
+	}
+
+	for (; node != NULL; node = g_list_next (node)) {
+		part = node->data;
+		if (!part->is_html) {
+			/* The first non-HTML part settles the answer */
+			return FALSE;
+		}
+	}
+
+	return TRUE;
+}
+
+/*
+ * Check whether a recipient list is in ascending order.
+ *
+ * Names and addresses are checked independently with ASCII case-insensitive
+ * comparison: each non-NULL name is compared with the previous non-NULL
+ * name, and each non-NULL address with the previous non-NULL address, so
+ * recipients without a display name are still checked by address.
+ *
+ * @param ia head of the recipient list
+ * @return FALSE if the list has fewer than MIN_RCPT_TO_COMPARE entries or
+ *         if any name or address is lower than its predecessor, TRUE
+ *         otherwise
+ */
+static gboolean
+is_recipient_list_sorted (const InternetAddressList *ia)
+{
+	const InternetAddressList *cur;
+	InternetAddress *addr;
+	struct addr_list current = {NULL, NULL}, previous = {NULL, NULL};
+
+	/* Do not check too short address lists */
+	if (internet_address_list_length (ia) < MIN_RCPT_TO_COMPARE) {
+		return FALSE;
+	}
+
+	for (cur = ia; cur != NULL; cur = internet_address_list_next (cur)) {
+		addr = internet_address_list_get_address (cur);
+		if (addr == NULL) {
+			continue;
+		}
+		current.name = internet_address_get_name (addr);
+		current.addr = internet_address_get_addr (addr);
+
+		if (current.name != NULL) {
+			if (previous.name != NULL &&
+				g_ascii_strcasecmp (current.name, previous.name) < 0) {
+				return FALSE;
+			}
+			previous.name = current.name;
+		}
+		if (current.addr != NULL) {
+			if (previous.addr != NULL &&
+				g_ascii_strcasecmp (current.addr, previous.addr) < 0) {
+				return FALSE;
+			}
+			previous.addr = current.addr;
+		}
+	}
+
+	return TRUE;
+}
+
+/*
+ * Report whether any recipient header of the message holds a sorted list.
+ *
+ * @param task task being processed; its parsed message is inspected
+ * @param args unused
+ * @return TRUE as soon as one of the To, Bcc or Cc lists (checked in that
+ *         order) is reported sorted by is_recipient_list_sorted (),
+ *         FALSE otherwise
+ */
+gboolean
+rspamd_is_recipients_sorted (struct worker_task *task, GList *args)
+{
+	/* Short-circuit evaluation keeps the To, Bcc, Cc order and stops at the first sorted list */
+	return is_recipient_list_sorted (g_mime_message_get_recipients (task->message, GMIME_RECIPIENT_TYPE_TO)) == TRUE
+		|| is_recipient_list_sorted (g_mime_message_get_recipients (task->message, GMIME_RECIPIENT_TYPE_BCC)) == TRUE
+		|| is_recipient_list_sorted (g_mime_message_get_recipients (task->message, GMIME_RECIPIENT_TYPE_CC)) == TRUE;
+}
+
/*
* vi:ts=4
*/
diff --git a/src/fuzzy.c b/src/fuzzy.c
index ecd1af64f..a53ed929d 100644
--- a/src/fuzzy.c
+++ b/src/fuzzy.c
@@ -107,7 +107,7 @@ fuzzy_update (fuzzy_hash_t *h, char c)
*
* Replace cost is normally 1, and 2 with nonzero xcost.
*/
-static uint32_t
+uint32_t
lev_distance (char *s1, int len1, char *s2, int len2)
{
int i;
diff --git a/src/fuzzy.h b/src/fuzzy.h
index 50d1a9110..4895e6c53 100644
--- a/src/fuzzy.h
+++ b/src/fuzzy.h
@@ -37,5 +37,7 @@ fuzzy_hash_t * fuzzy_init_byte_array (GByteArray *in, memory_pool_t *pool);
*/
int fuzzy_compare_hashes (fuzzy_hash_t *h1, fuzzy_hash_t *h2);
+uint32_t lev_distance (char *s1, int len1, char *s2, int len2);
+
#endif
diff --git a/src/main.h b/src/main.h
index 431ed8f4b..ab785fa5a 100644
--- a/src/main.h
+++ b/src/main.h
@@ -173,6 +173,7 @@ struct worker_task {
memcached_ctx_t *memc_ctx; /**< memcached context associated with task */
int parts_count; /**< mime parts count */
GMimeMessage *message; /**< message, parsed with GMime */
+ InternetAddressList *rcpts; /**< list of all recipients */
GList *parts; /**< list of parsed parts */
GList *text_parts; /**< list of text parts */
char *raw_headers; /**< list of raw headers */
diff --git a/src/message.c b/src/message.c
index c35659366..156bd1829 100644
--- a/src/message.c
+++ b/src/message.c
@@ -406,6 +406,11 @@ process_message (struct worker_task *task)
memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_free, task->raw_headers);
}
+ task->rcpts = g_mime_message_get_all_recipients (message);
+ if (task->rcpts) {
+ memory_pool_add_destructor (task->task_pool, (pool_destruct_func)internet_address_list_destroy, task->rcpts);
+ }
+
task->worker->srv->stat->messages_scanned ++;
/* free the parser (and the stream) */