diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2009-03-23 14:10:07 +0300 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2009-03-23 14:10:07 +0300 |
commit | c79b5ccd22cbc1c273479f4f88189a18effda533 (patch) | |
tree | 1741743779a70146a61cd1767936aa43d671e36b | |
parent | afdaddc4d0745a5bcefad73dd74fd4c03ae3de15 (diff) | |
download | rspamd-c79b5ccd22cbc1c273479f4f88189a18effda533.tar.gz rspamd-c79b5ccd22cbc1c273479f4f88189a18effda533.zip |
* Fix a bug in the expression parser that caused errors on expressions ending with a regexp
* Improve test for fuzzy hashes
* Add new object - TextPart to perl XS library that allows access to stripped parts and fuzzy hashes
* Add documentation for expressions parser and for Mail::Rspamd::TextPart
* Always calculate fuzzy hash for text parts
* Store text parts separately from other parts
* Add compare_parts_distance function for expressions that calculates the difference between the two text parts of a message
* Do not try to substitute variables in empty strings
-rw-r--r-- | README.utf8.txt | 23 | ||||
-rw-r--r-- | perl/Makefile.PL.in | 3 | ||||
-rw-r--r-- | perl/Rspamd.pod | 40 | ||||
-rw-r--r-- | perl/Rspamd.xs | 3 | ||||
-rw-r--r-- | perl/Rspamd/Task.xs | 21 | ||||
-rw-r--r-- | perl/Rspamd/TextPart.xs | 40 | ||||
-rw-r--r-- | perl/typemap | 1 | ||||
-rw-r--r-- | src/cfg_utils.c | 5 | ||||
-rw-r--r-- | src/expressions.c | 64 | ||||
-rw-r--r-- | src/filter.c | 12 | ||||
-rw-r--r-- | src/fuzzy.c | 13 | ||||
-rw-r--r-- | src/fuzzy.h | 1 | ||||
-rw-r--r-- | src/main.h | 1 | ||||
-rw-r--r-- | src/message.c | 19 | ||||
-rw-r--r-- | src/message.h | 8 | ||||
-rw-r--r-- | test/rspamd_fuzzy_test.c | 38 |
16 files changed, 273 insertions, 19 deletions
diff --git a/README.utf8.txt b/README.utf8.txt index c5bc293c0..eed4010a2 100644 --- a/README.utf8.txt +++ b/README.utf8.txt @@ -1,7 +1,7 @@ API Rspamd. =========== -TODO. +API rspamd описано подробно в Doxygen документации. Логика работы фильтров rspamd. ============================== @@ -118,3 +118,24 @@ $subject_blah = "Subject=/blah/H"; тогда предыдущее выражение будет таким SOME_SYMBOL = "${to_blah} & !(${from_blah} | ${subject_blah})" + +Логические выражения rspamd +=========================== + +Условия, содержащие регулярные выражения, функции, логические операции, скобки, могут использоваться +для задания правил фильтрации. Общие правила работы: +- логическими операциями могут быть логическое "И": '&', логическое "ИЛИ": '|' и логическое отрицание: +'!'. +- приоритет логических операций такой: & и | -> !, для изменения приоритета можно пользоваться скобками: + (A&!B) | !(C|D) +- пробелы в выражениях игнорируются +- операнд, содержащий /re/args или же string=/re/args считается регулярным выражением, внутри регулярного +выражения все символы '/' и '"' должны экранироваться символом '\'. Сам '\' при этом экранировать не нужно. +- операнд, который принимает аргументы, считается функцией, аргументом функции может являться другая функция, +при этом порядок вызова функций-аргументов - справа налево (как это сделано в gcc) +- в rspamd встроен ряд функций: + * header_exists - принимает в качестве аргумента имя хедера, возвращает true, если такой заголовок существует + * compare_parts_distance - принимает в качестве аргумента число от 0 до 100, которое отражает разницу в процентах + между частями письма. Функция работает с сообщениями, содержащими 2 текстовые части (text/plain и text/html) и + возвращает true тогда, когда эти части различаются более чем на n процентов. Если аргумент не указан, то + по умолчанию ищется различие в 100% (полностью разные части). 
diff --git a/perl/Makefile.PL.in b/perl/Makefile.PL.in index 0d70d4270..10ca63e16 100644 --- a/perl/Makefile.PL.in +++ b/perl/Makefile.PL.in @@ -1,7 +1,7 @@ use ExtUtils::MakeMaker; WriteMakefile( NAME => 'Mail::Rspamd', - AUTHOR => 'Vsevolod Stakhov <vsevolod@rambler-co.ru>', + AUTHOR => 'Vsevolod Stakhov <vsevolod@highsecure.ru>', XS => { 'Rspamd.xs' => 'Rspamd.c' }, VERSION_FROM => 'Rspamd.pm', # finds $VERSION LIBS => ['${GLIB_LDFLAGS} ${GMIME_LDFLAGS} -levent'], # e.g., '-lm' @@ -15,6 +15,7 @@ WriteMakefile( 'Rspamd.c' => qw{ Rspamd/ContentType.xs Rspamd/Part.xs Rspamd/Hash.xs Rspamd/InternetAddress.xs Rspamd/Message.xs Rspamd/Object.xs +Rspamd/TextPart.xs }, }, ); diff --git a/perl/Rspamd.pod b/perl/Rspamd.pod index d40574820..2af9b4965 100644 --- a/perl/Rspamd.pod +++ b/perl/Rspamd.pod @@ -197,7 +197,7 @@ E.g.: Mail::Rspamd::Header is a private structure. This structure contains all the headers except special ones (Content-* MIME-Version). -Look for L<Header tied hash> for easy maintaining for header. +Look for Header tied hash for easy maintaining for header. Use also the Mail::Rspamd::Message::get_header() and set_header() methods. =back @@ -476,6 +476,10 @@ Return Mail::Rspamd::Config object. Return message's urls as array of strings. +=item I<get_text_parts> () + +Return message's text parts as array of Mail::Rspamd::TextPart objects. + =back =head2 Mail::Rspamd::Config @@ -493,15 +497,23 @@ Gets and sets specified parameter in config. 
=item I<get_metric> (metric) Returns hash of parameters of specified metric: + +=begin text + { 'name' => name of metric 'func_name' => consolidation function 'required_score' => score for metric } +=end text + =item I<get_statfile> (statfile) Returns parameters of specified statfile: + +=begin text + { 'alias' => alias of statfile 'pattern' => fs pattern @@ -510,12 +522,38 @@ Returns parameters of specified statfile: 'size' => size of statfile } +=end text + =item I<get_module_param> (modulename, paramname) Return parameter's value for specified module. =back +=head2 Mail::Rspamd::TextPart + +Object that represent a single text part of message. + +=over 4 + +=item I<get_content> () + +Returns content of part. + +=item I<get_fuzzy> () + +Returns fuzzy hash of part as string. + +=item I<compare_distance> (other) + +Calculate distance between two parts using their fuzzy hashes. Return value from 0 (identical) to 100 (totally different). + +=item I<is_html> () + +Return 0 if part is plain text and not 0 otherwise. 
+ +=back + =head1 CONSTANT VARIABLES GMIME_LENGTH_ENCODED diff --git a/perl/Rspamd.xs b/perl/Rspamd.xs index 9231d3e62..892e4b006 100644 --- a/perl/Rspamd.xs +++ b/perl/Rspamd.xs @@ -14,6 +14,7 @@ #include "../src/cfg_file.h" #include "../src/perl.h" #include "../src/mem_pool.h" +#include "../src/fuzzy.h" #define XSINTERFACE_FUNC_RSPAMD_MESSAGE_SET(cv,f) \ CvXSUBANY(cv).any_dptr = (void (*) (pTHX_ void*))(CAT2( g_mime_message_,f )) @@ -47,6 +48,7 @@ typedef GMimePartEncodingType Mail__Rspamd__PartEncodingType; typedef GMimeObject * Mail__Rspamd__Object; typedef GMimeParam * Mail__Rspamd__Param; typedef GMimePart * Mail__Rspamd__Part; +typedef struct mime_text_part * Mail__Rspamd__TextPart; typedef GMimeParser * Mail__Rspamd__Parser; typedef GMimeMultipart * Mail__Rspamd__MultiPart; typedef GMimeMessage * Mail__Rspamd__Message; @@ -401,5 +403,6 @@ INCLUDE: Rspamd/Message.xs INCLUDE: Rspamd/InternetAddress.xs INCLUDE: Rspamd/Hash.xs +INCLUDE: Rspamd/TextPart.xs diff --git a/perl/Rspamd/Task.xs b/perl/Rspamd/Task.xs index 16719ef7b..31928bf7b 100644 --- a/perl/Rspamd/Task.xs +++ b/perl/Rspamd/Task.xs @@ -81,3 +81,24 @@ rspamd_task_get_urls (task) OUTPUT: RETVAL +AV* +rspamd_task_get_text_parts (task) + Mail::Rspamd::Task task + PREINIT: + AV* retav; + GList *cur; + SV* ps; + CODE: + retav = newAV (); + cur = g_list_first (task->text_parts); + while (cur) { + ps = newSViv (0); + sv_setref_pv (ps, "Mail::Rspamd::TextPart", (Mail__Rspamd__TextPart)(cur->data)); + av_push(retav, ps); + cur = g_list_next (task->text_parts); + } + + RETVAL = retav; + OUTPUT: + RETVAL + diff --git a/perl/Rspamd/TextPart.xs b/perl/Rspamd/TextPart.xs new file mode 100644 index 000000000..485ee7bcc --- /dev/null +++ b/perl/Rspamd/TextPart.xs @@ -0,0 +1,40 @@ +MODULE = Mail::Rspamd PACKAGE = Mail::Rspamd::TextPart PREFIX = rspamd_text_part_ + +SV * +rspamd_text_part_get_content (mime_part) + Mail::Rspamd::TextPart mime_part + PREINIT: + SV* content; + CODE: + ST(0) = &PL_sv_undef; + content = 
sv_newmortal (); + SvUPGRADE (content, SVt_PV); + SvREADONLY_on (content); + SvPVX(content) = (char *) (mime_part->content->data); + SvCUR_set (content, mime_part->content->len); + SvLEN_set (content, 0); + SvPOK_only (content); + ST(0) = content; + +char * +rspamd_text_part_get_fuzzy (mime_part) + Mail::Rspamd::TextPart mime_part + CODE: + RETVAL = mime_part->fuzzy->hash_pipe; + +int +rspamd_text_part_compare_distance (mime_part, other) + Mail::Rspamd::TextPart mime_part + Mail::Rspamd::TextPart other + CODE: + RETVAL = fuzzy_compare_hashes (mime_part->fuzzy, other->fuzzy); + OUTPUT: + RETVAL + +int +rspamd_text_part_is_html (mime_part) + Mail::Rspamd::TextPart mime_part + CODE: + RETVAL = mime_part->is_html; + OUTPUT: + RETVAL diff --git a/perl/typemap b/perl/typemap index fa2dabb43..0ae0ef7de 100644 --- a/perl/typemap +++ b/perl/typemap @@ -20,6 +20,7 @@ Mail::Rspamd::Object T_PTROBJ Mail::Rspamd::Param T_PTROBJ Mail::Rspamd::Message T_PTROBJ Mail::Rspamd::Part T_PTROBJ +Mail::Rspamd::TextPart T_PTROBJ Mail::Rspamd::ContentType T_PTROBJ Mail::Rspamd::InternetAddress T_PTROBJ Mail::Rspamd::Hash::Header T_PTROBJ diff --git a/src/cfg_utils.c b/src/cfg_utils.c index 1eeb518ed..037f23754 100644 --- a/src/cfg_utils.c +++ b/src/cfg_utils.c @@ -355,6 +355,11 @@ substitute_variable (struct config_file *cfg, char *str, u_char recursive) char *var, *new, *v_begin, *v_end; size_t len; + if (str == NULL) { + yywarn ("substitute_variable: trying to substitute variable in NULL string"); + return NULL; + } + while ((v_begin = strstr (str, "${")) != NULL) { len = strlen (str); *v_begin = '\0'; diff --git a/src/expressions.c b/src/expressions.c index 5cb30e4c3..eefd11f78 100644 --- a/src/expressions.c +++ b/src/expressions.c @@ -26,12 +26,15 @@ #include "util.h" #include "cfg_file.h" #include "main.h" +#include "message.h" +#include "fuzzy.h" #include "expressions.h" typedef gboolean (*rspamd_internal_func_t)(struct worker_task *, GList *args); gboolean rspamd_compare_encoding 
(struct worker_task *task, GList *args); gboolean rspamd_header_exists (struct worker_task *task, GList *args); +gboolean rspamd_parts_distance (struct worker_task *task, GList *args); /* * List of internal functions of rspamd * Sorted by name to use bsearch @@ -41,6 +44,7 @@ static struct _fl { rspamd_internal_func_t func; } rspamd_functions_list[] = { { "compare_encoding", rspamd_compare_encoding }, + { "compare_parts_distance", rspamd_parts_distance }, { "header_exists", rspamd_header_exists }, }; @@ -273,7 +277,9 @@ parse_expression (memory_pool_t *pool, char *line) case READ_REGEXP: if (*p == '/' && *(p - 1) != '\\') { - p ++; + if (*(p + 1)) { + p ++; + } state = READ_REGEXP_FLAGS; } else { @@ -285,14 +291,17 @@ parse_expression (memory_pool_t *pool, char *line) if (!is_regexp_flag (*p) || *(p + 1) == '\0') { if (c != p) { /* Copy operand */ - str = memory_pool_alloc (pool, p - c + 3); - g_strlcpy (str, c - 1, (p - c + 3)); + if (*(p + 1) == '\0') { + p++; + } + str = memory_pool_alloc (pool, p - c + 2); + g_strlcpy (str, c - 1, (p - c + 2)); g_strstrip (str); if (strlen (str) > 0) { insert_expression (pool, &expr, EXPR_REGEXP, 0, str); } } - c = ++p; + c = p; state = SKIP_SPACES; } else { @@ -594,5 +603,52 @@ rspamd_header_exists (struct worker_task *task, GList *args) } /* + * This function is designed to find difference between text/html and text/plain parts + * It takes one argument: difference threshold, if we have two text parts, compare + * its hashes and check for threshold, if value is greater than threshold, return TRUE + * and return FALSE otherwise. 
+ */ +gboolean +rspamd_parts_distance (struct worker_task *task, GList *args) +{ + int threshold; + struct mime_text_part *p1, *p2; + GList *cur; + + if (args == NULL) { + msg_debug ("rspamd_parts_distance: no threshold is specified, assume it 100"); + threshold = 100; + } + else { + errno = 0; + threshold = strtoul ((char *)args->data, NULL, 10); + if (errno != 0) { + msg_info ("rspamd_parts_distance: bad numeric value for threshold \"%s\", assume it 100", (char *)args->data); + threshold = 100; + } + } + + if (g_list_length (task->text_parts) == 2) { + cur = g_list_first (task->text_parts); + p1 = cur->data; + cur = g_list_next (cur); + if (cur == NULL) { + msg_info ("rspamd_parts_distance: bad parts list"); + return FALSE; + } + p2 = cur->data; + if (fuzzy_compare_hashes (p1->fuzzy, p2->fuzzy) >= threshold) { + return TRUE; + } + } + else { + msg_debug ("rspamd_parts_distance: message has too many text parts, so do not try to compare them with each other"); + return FALSE; + } + + return FALSE; +} + +/* * vi:ts=4 */ diff --git a/src/filter.c b/src/filter.c index 766cd16e4..1b6cdc1b0 100644 --- a/src/filter.c +++ b/src/filter.c @@ -437,10 +437,10 @@ statfiles_callback (gpointer key, gpointer value, void *arg) struct classifier *classifier; struct statfile_result_data *res_data; struct metric *metric; + struct mime_text_part *text_part; GTree *tokens = NULL; - GList *cur = NULL; - GByteArray *content; + GList *cur; char *filename; f_str_t c; @@ -457,10 +457,12 @@ statfiles_callback (gpointer key, gpointer value, void *arg) return; } + cur = g_list_first (task->text_parts); if ((tokens = g_hash_table_lookup (data->tokens, st->tokenizer)) == NULL) { - while ((content = get_next_text_part (task->task_pool, task->parts, &cur)) != NULL) { - c.begin = content->data; - c.len = content->len; + while (cur != NULL) { + text_part = (struct mime_text_part *)cur->data; + c.begin = text_part->content->data; + c.len = text_part->content->len; /* Tree would be freed at task pool 
freeing */ if (!st->tokenizer->tokenize_func (st->tokenizer, task->task_pool, &c, &tokens)) { msg_info ("statfiles_callback: cannot tokenize input"); diff --git a/src/fuzzy.c b/src/fuzzy.c index 08814eaa1..4dfec3fb7 100644 --- a/src/fuzzy.c +++ b/src/fuzzy.c @@ -95,7 +95,7 @@ fuzzy_update (fuzzy_hash_t *h, char c) if (h->rh % h->block_size == (h->block_size - 1)) { h->hash_pipe[h->hi] = h->h; - if (h->hi < FUZZY_HASHLEN - 1) { + if (h->hi < FUZZY_HASHLEN - 2) { h->h = HASH_INIT; h->hi ++; } @@ -249,6 +249,17 @@ fuzzy_init (f_str_t *in, memory_pool_t *pool) return new; } +fuzzy_hash_t * +fuzzy_init_byte_array (GByteArray *in, memory_pool_t *pool) +{ + f_str_t f; + + f.begin = in->data; + f.len = in->len; + + return fuzzy_init (&f, pool); +} + /* Compare score of difference between two hashes 0 - different hashes, 100 - identical hashes */ int fuzzy_compare_hashes (fuzzy_hash_t *h1, fuzzy_hash_t *h2) diff --git a/src/fuzzy.h b/src/fuzzy.h index 91e6512c6..50d1a9110 100644 --- a/src/fuzzy.h +++ b/src/fuzzy.h @@ -27,6 +27,7 @@ typedef struct fuzzy_hash_s { * @return fuzzy_hash object allocated in pool */ fuzzy_hash_t * fuzzy_init (f_str_t *in, memory_pool_t *pool); +fuzzy_hash_t * fuzzy_init_byte_array (GByteArray *in, memory_pool_t *pool); /** * Compare score of difference between two hashes diff --git a/src/main.h b/src/main.h index 28eb64297..e26ab3fda 100644 --- a/src/main.h +++ b/src/main.h @@ -174,6 +174,7 @@ struct worker_task { int parts_count; /**< mime parts count */ GMimeMessage *message; /**< message, parsed with GMime */ GList *parts; /**< list of parsed parts */ + GList *text_parts; /**< list of text parts */ char *raw_headers; /**< list of raw headers */ TAILQ_HEAD (uriq, uri) urls; /**< list of parsed urls */ GHashTable *results; /**< hash table of metric_result indexed by diff --git a/src/message.c b/src/message.c index 76743f7de..807463a82 100644 --- a/src/message.c +++ b/src/message.c @@ -242,6 +242,7 @@ mime_foreach_callback (GMimeObject *part, 
gpointer user_data) { struct worker_task *task = (struct worker_task *)user_data; struct mime_part *mime_part; + struct mime_text_part *text_part; GMimeContentType *type; GMimeDataWrapper *wrapper; GMimeStream *part_stream; @@ -302,13 +303,27 @@ mime_foreach_callback (GMimeObject *part, gpointer user_data) mime_part->content = part_content; msg_debug ("mime_foreach_callback: found part with content-type: %s/%s", type->type, type->subtype); task->parts = g_list_prepend (task->parts, mime_part); - if (g_mime_content_type_is_type (type, "text", "html")) { + /* Now do special processing for text parts of message */ + if (g_mime_content_type_is_type (type, "text", "html") || g_mime_content_type_is_type (type, "text", "xhtml")) { msg_debug ("mime_foreach_callback: got urls from text/html part"); url_parse_html (task, part_content); + + text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part)); + text_part->content = strip_html_tags (part_content, NULL); + text_part->is_html = TRUE; + text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool); + memory_pool_add_destructor (task->task_pool, (pool_destruct_func)free_byte_array_callback, text_part->content); + task->text_parts = g_list_prepend (task->text_parts, text_part); } else if (g_mime_content_type_is_type (type, "text", "plain")) { - url_parse_text (task, part_content); msg_debug ("mime_foreach_callback: got urls from text/plain part"); + url_parse_text (task, part_content); + + text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part)); + text_part->content = part_content; + text_part->is_html = FALSE; + text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool); + task->text_parts = g_list_prepend (task->text_parts, text_part); } } else { diff --git a/src/message.h b/src/message.h index 1122e7e3b..eaf9f493e 100644 --- a/src/message.h +++ b/src/message.h @@ -7,11 +7,17 @@ #define RSPAMD_MESSAGE_H #include "config.h" +#include "fuzzy.h" 
struct mime_part { GMimeContentType *type; GByteArray *content; - TAILQ_ENTRY (mime_part) next; +}; + +struct mime_text_part { + gboolean is_html; + GByteArray *content; + fuzzy_hash_t *fuzzy; }; /** diff --git a/test/rspamd_fuzzy_test.c b/test/rspamd_fuzzy_test.c index d737a9171..9feeb4500 100644 --- a/test/rspamd_fuzzy_test.c +++ b/test/rspamd_fuzzy_test.c @@ -21,24 +21,56 @@ static char *s2 = "This is sample test text.\r\n" "abcdefghijklmnopqrstuvwx.\r\n" "abcdefghijklmnopqrstuvwx.\r\n" "abcdefghijklmnopqrstuvwx.\r\n"; +static char *s3 = ""; +static char *s4 = "abcdefghijklmn\r\n"; +static char *s5 = "This is sample test text.\r\n" + "abcdefghijklmnopqrstuvwx.\r\n" + "abcdefghijklmnopzrstuvwx.\r\n" + "abcdefghijklmnopqrstuvwx.\r\n" + "abcdefghijklmnopqrstuvwx.\r\n" + "abcdefghijklmnopqrstuvwx.\r\n" + "abcdefghijklmnopqrstuvwx.\r\n" + "abcdefghijklmnopqrstuvwx.\r\n" + "abcdefghijklmnopqrstuvwx.\r\n"; void rspamd_fuzzy_test_func () { memory_pool_t *pool; - fuzzy_hash_t *h1, *h2; - f_str_t f1, f2; + fuzzy_hash_t *h1, *h2, *h3, *h4, *h5; + f_str_t f1, f2, f3, f4, f5; + int diff1, diff2; pool = memory_pool_new (1024); f1.begin = s1; f1.len = strlen (s1); f2.begin = s2; f2.len = strlen (s2); + f3.begin = s3; + f3.len = strlen (s3); + f4.begin = s4; + f4.len = strlen (s4); + f5.begin = s5; + f5.len = strlen (s5); h1 = fuzzy_init (&f1, pool); h2 = fuzzy_init (&f2, pool); + h3 = fuzzy_init (&f3, pool); + h4 = fuzzy_init (&f4, pool); + h5 = fuzzy_init (&f5, pool); - msg_info ("rspamd_fuzzy_test_func: difference between strings is %d", fuzzy_compare_hashes (h1, h2)); + diff1 = fuzzy_compare_hashes (h3, h4) + fuzzy_compare_hashes (h2, h4); + diff2 = fuzzy_compare_hashes (h2, h5); + msg_debug ("rspamd_fuzzy_test_func: s1, s2 difference between strings is %d", fuzzy_compare_hashes (h1, h2)); + msg_debug ("rspamd_fuzzy_test_func: s1, s3 difference between strings is %d", fuzzy_compare_hashes (h1, h3)); + msg_debug ("rspamd_fuzzy_test_func: s3, s4 difference between strings is 
%d", fuzzy_compare_hashes (h3, h4)); + msg_debug ("rspamd_fuzzy_test_func: s2, s4 difference between strings is %d", fuzzy_compare_hashes (h2, h4)); + msg_debug ("rspamd_fuzzy_test_func: s2, s5 difference between strings is %d", diff2); + + /* Identical strings */ + g_assert (diff2 == 0); + /* Totally different strings */ + g_assert (diff1 == 200); memory_pool_delete (pool); } |