diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2009-03-23 14:10:07 +0300 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2009-03-23 14:10:07 +0300 |
commit | c79b5ccd22cbc1c273479f4f88189a18effda533 (patch) | |
tree | 1741743779a70146a61cd1767936aa43d671e36b /src | |
parent | afdaddc4d0745a5bcefad73dd74fd4c03ae3de15 (diff) | |
download | rspamd-c79b5ccd22cbc1c273479f4f88189a18effda533.tar.gz rspamd-c79b5ccd22cbc1c273479f4f88189a18effda533.zip |
* Fix a bug in the expression parser that caused errors when an expression ended with a regexp
* Improve test for fuzzy hashes
* Add new object - TextPart to perl XS library that allows access to stripped parts and fuzzy hashes
* Add documentation for the expression parser and for Mail::Rspamd::TextPart
* Always calculate fuzzy hash for text parts
* Store text parts separately from other parts
* Add compare_parts_distance function for expressions, which compares the fuzzy hashes of the two text parts of a message
* Do not try to substitute variables in empty (NULL) strings
Diffstat (limited to 'src')
-rw-r--r-- | src/cfg_utils.c | 5 | ||||
-rw-r--r-- | src/expressions.c | 64 | ||||
-rw-r--r-- | src/filter.c | 12 | ||||
-rw-r--r-- | src/fuzzy.c | 13 | ||||
-rw-r--r-- | src/fuzzy.h | 1 | ||||
-rw-r--r-- | src/main.h | 1 | ||||
-rw-r--r-- | src/message.c | 19 | ||||
-rw-r--r-- | src/message.h | 8 |
8 files changed, 110 insertions, 13 deletions
diff --git a/src/cfg_utils.c b/src/cfg_utils.c index 1eeb518ed..037f23754 100644 --- a/src/cfg_utils.c +++ b/src/cfg_utils.c @@ -355,6 +355,11 @@ substitute_variable (struct config_file *cfg, char *str, u_char recursive) char *var, *new, *v_begin, *v_end; size_t len; + if (str == NULL) { + yywarn ("substitute_variable: trying to substitute variable in NULL string"); + return NULL; + } + while ((v_begin = strstr (str, "${")) != NULL) { len = strlen (str); *v_begin = '\0'; diff --git a/src/expressions.c b/src/expressions.c index 5cb30e4c3..eefd11f78 100644 --- a/src/expressions.c +++ b/src/expressions.c @@ -26,12 +26,15 @@ #include "util.h" #include "cfg_file.h" #include "main.h" +#include "message.h" +#include "fuzzy.h" #include "expressions.h" typedef gboolean (*rspamd_internal_func_t)(struct worker_task *, GList *args); gboolean rspamd_compare_encoding (struct worker_task *task, GList *args); gboolean rspamd_header_exists (struct worker_task *task, GList *args); +gboolean rspamd_parts_distance (struct worker_task *task, GList *args); /* * List of internal functions of rspamd * Sorted by name to use bsearch @@ -41,6 +44,7 @@ static struct _fl { rspamd_internal_func_t func; } rspamd_functions_list[] = { { "compare_encoding", rspamd_compare_encoding }, + { "compare_parts_distance", rspamd_parts_distance }, { "header_exists", rspamd_header_exists }, }; @@ -273,7 +277,9 @@ parse_expression (memory_pool_t *pool, char *line) case READ_REGEXP: if (*p == '/' && *(p - 1) != '\\') { - p ++; + if (*(p + 1)) { + p ++; + } state = READ_REGEXP_FLAGS; } else { @@ -285,14 +291,17 @@ parse_expression (memory_pool_t *pool, char *line) if (!is_regexp_flag (*p) || *(p + 1) == '\0') { if (c != p) { /* Copy operand */ - str = memory_pool_alloc (pool, p - c + 3); - g_strlcpy (str, c - 1, (p - c + 3)); + if (*(p + 1) == '\0') { + p++; + } + str = memory_pool_alloc (pool, p - c + 2); + g_strlcpy (str, c - 1, (p - c + 2)); g_strstrip (str); if (strlen (str) > 0) { insert_expression (pool, 
&expr, EXPR_REGEXP, 0, str); } } - c = ++p; + c = p; state = SKIP_SPACES; } else { @@ -594,5 +603,52 @@ rspamd_header_exists (struct worker_task *task, GList *args) } /* + * This function is designed to find difference between text/html and text/plain parts + * It takes one argument: difference threshold, if we have two text parts, compare + * its hashes and check for threshold, if value is greater than threshold, return TRUE + * and return FALSE otherwise. + */ +gboolean +rspamd_parts_distance (struct worker_task *task, GList *args) +{ + int threshold; + struct mime_text_part *p1, *p2; + GList *cur; + + if (args == NULL) { + msg_debug ("rspamd_parts_distance: no threshold is specified, assume it 100"); + threshold = 100; + } + else { + errno = 0; + threshold = strtoul ((char *)args->data, NULL, 10); + if (errno != 0) { + msg_info ("rspamd_parts_distance: bad numeric value for threshold \"%s\", assume it 100", (char *)args->data); + threshold = 100; + } + } + + if (g_list_length (task->text_parts) == 2) { + cur = g_list_first (task->text_parts); + p1 = cur->data; + cur = g_list_next (cur); + if (cur == NULL) { + msg_info ("rspamd_parts_distance: bad parts list"); + return FALSE; + } + p2 = cur->data; + if (fuzzy_compare_hashes (p1->fuzzy, p2->fuzzy) >= threshold) { + return TRUE; + } + } + else { + msg_debug ("rspamd_parts_distance: message has too many text parts, so do not try to compare them with each other"); + return FALSE; + } + + return FALSE; +} + +/* * vi:ts=4 */ diff --git a/src/filter.c b/src/filter.c index 766cd16e4..1b6cdc1b0 100644 --- a/src/filter.c +++ b/src/filter.c @@ -437,10 +437,10 @@ statfiles_callback (gpointer key, gpointer value, void *arg) struct classifier *classifier; struct statfile_result_data *res_data; struct metric *metric; + struct mime_text_part *text_part; GTree *tokens = NULL; - GList *cur = NULL; - GByteArray *content; + GList *cur; char *filename; f_str_t c; @@ -457,10 +457,12 @@ statfiles_callback (gpointer key, gpointer 
value, void *arg) return; } + cur = g_list_first (task->text_parts); if ((tokens = g_hash_table_lookup (data->tokens, st->tokenizer)) == NULL) { - while ((content = get_next_text_part (task->task_pool, task->parts, &cur)) != NULL) { - c.begin = content->data; - c.len = content->len; + while (cur != NULL) { + text_part = (struct mime_text_part *)cur->data; + c.begin = text_part->content->data; + c.len = text_part->content->len; /* Tree would be freed at task pool freeing */ if (!st->tokenizer->tokenize_func (st->tokenizer, task->task_pool, &c, &tokens)) { msg_info ("statfiles_callback: cannot tokenize input"); diff --git a/src/fuzzy.c b/src/fuzzy.c index 08814eaa1..4dfec3fb7 100644 --- a/src/fuzzy.c +++ b/src/fuzzy.c @@ -95,7 +95,7 @@ fuzzy_update (fuzzy_hash_t *h, char c) if (h->rh % h->block_size == (h->block_size - 1)) { h->hash_pipe[h->hi] = h->h; - if (h->hi < FUZZY_HASHLEN - 1) { + if (h->hi < FUZZY_HASHLEN - 2) { h->h = HASH_INIT; h->hi ++; } @@ -249,6 +249,17 @@ fuzzy_init (f_str_t *in, memory_pool_t *pool) return new; } +fuzzy_hash_t * +fuzzy_init_byte_array (GByteArray *in, memory_pool_t *pool) +{ + f_str_t f; + + f.begin = in->data; + f.len = in->len; + + return fuzzy_init (&f, pool); +} + /* Compare score of difference between two hashes 0 - different hashes, 100 - identical hashes */ int fuzzy_compare_hashes (fuzzy_hash_t *h1, fuzzy_hash_t *h2) diff --git a/src/fuzzy.h b/src/fuzzy.h index 91e6512c6..50d1a9110 100644 --- a/src/fuzzy.h +++ b/src/fuzzy.h @@ -27,6 +27,7 @@ typedef struct fuzzy_hash_s { * @return fuzzy_hash object allocated in pool */ fuzzy_hash_t * fuzzy_init (f_str_t *in, memory_pool_t *pool); +fuzzy_hash_t * fuzzy_init_byte_array (GByteArray *in, memory_pool_t *pool); /** * Compare score of difference between two hashes diff --git a/src/main.h b/src/main.h index 28eb64297..e26ab3fda 100644 --- a/src/main.h +++ b/src/main.h @@ -174,6 +174,7 @@ struct worker_task { int parts_count; /**< mime parts count */ GMimeMessage *message; /**< 
message, parsed with GMime */ GList *parts; /**< list of parsed parts */ + GList *text_parts; /**< list of text parts */ char *raw_headers; /**< list of raw headers */ TAILQ_HEAD (uriq, uri) urls; /**< list of parsed urls */ GHashTable *results; /**< hash table of metric_result indexed by diff --git a/src/message.c b/src/message.c index 76743f7de..807463a82 100644 --- a/src/message.c +++ b/src/message.c @@ -242,6 +242,7 @@ mime_foreach_callback (GMimeObject *part, gpointer user_data) { struct worker_task *task = (struct worker_task *)user_data; struct mime_part *mime_part; + struct mime_text_part *text_part; GMimeContentType *type; GMimeDataWrapper *wrapper; GMimeStream *part_stream; @@ -302,13 +303,27 @@ mime_foreach_callback (GMimeObject *part, gpointer user_data) mime_part->content = part_content; msg_debug ("mime_foreach_callback: found part with content-type: %s/%s", type->type, type->subtype); task->parts = g_list_prepend (task->parts, mime_part); - if (g_mime_content_type_is_type (type, "text", "html")) { + /* Now do special processing for text parts of message */ + if (g_mime_content_type_is_type (type, "text", "html") || g_mime_content_type_is_type (type, "text", "xhtml")) { msg_debug ("mime_foreach_callback: got urls from text/html part"); url_parse_html (task, part_content); + + text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part)); + text_part->content = strip_html_tags (part_content, NULL); + text_part->is_html = TRUE; + text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool); + memory_pool_add_destructor (task->task_pool, (pool_destruct_func)free_byte_array_callback, text_part->content); + task->text_parts = g_list_prepend (task->text_parts, text_part); } else if (g_mime_content_type_is_type (type, "text", "plain")) { - url_parse_text (task, part_content); msg_debug ("mime_foreach_callback: got urls from text/plain part"); + url_parse_text (task, part_content); + + text_part = memory_pool_alloc 
(task->task_pool, sizeof (struct mime_text_part)); + text_part->content = part_content; + text_part->is_html = FALSE; + text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool); + task->text_parts = g_list_prepend (task->text_parts, text_part); } } else { diff --git a/src/message.h b/src/message.h index 1122e7e3b..eaf9f493e 100644 --- a/src/message.h +++ b/src/message.h @@ -7,11 +7,17 @@ #define RSPAMD_MESSAGE_H #include "config.h" +#include "fuzzy.h" struct mime_part { GMimeContentType *type; GByteArray *content; - TAILQ_ENTRY (mime_part) next; +}; + +struct mime_text_part { + gboolean is_html; + GByteArray *content; + fuzzy_hash_t *fuzzy; }; /** |