* Fix error in expression parser that causes bad errors with expressions that have regexp at the end
* Improve test for fuzzy hashes * Add new object - TextPart to perl XS library that allows access to stripped parts and fuzzy hashes * Add documentation for expressions parser and for Mail::Rspamd::TextPart * Always calculate fuzzy hash for text parts * Store text parts separately from other parts * Add compare_parts_distance for expressions that calculates the difference between the parts of two-part messages * Do not try to substitute variables in empty strings
diff --git a/README.utf8.txt b/README.utf8.txt
index c5bc293c0..eed4010a2 100644
--- a/README.utf8.txt
+++ b/README.utf8.txt
@@ -1,7 +1,7 @@
API Rspamd.
+API rspamd описано подробно в Doxygen документации.
Логика работы фильтров rspamd.
@@ -118,3 +118,24 @@ $subject_blah = "Subject=/blah/H";
тогда предыдущее выражение будет таким
SOME_SYMBOL = "${to_blah} & !(${from_blah} | ${subject_blah})"
+Логические выражения rspamd
+Условия, содержащие регулярные выражения, функции, логические операции, скобки, могут использоваться
+для задания правил фильтрации. Общие правила работы:
+- логическими операциями могут быть логическое "И": '&', логическое "ИЛИ": '|' и логическое отрицание: '!'
+- приоритет логических операций такой: & и | -> !, для изменения приоритета можно пользоваться скобками:
+ (A&!B) | !(C|D)
+- пробелы в выражениях игнорируются
+- операнд, содержащий /re/args или же string=/re/args считается регулярным выражением, внутри регулярного
+выражения все символы '/' и '"' должны экранироваться символом '\'. Сам '\' при этом экранировать не нужно.
+- операнд, который принимает аргументы, считается функцией, аргументом функции может являться другая функция,
+при этом порядок вызова функций-аргументов - справа налево (как это сделано в gcc)
+- в rspamd встроен ряд функций:
+ * header_exists - принимает в качестве аргумента имя хедера, возвращает true, если такой заголовок существует
+ * compare_parts_distance - принимает в качестве аргумента число от 0 до 100, которое отражает разницу в процентах
+ между частями письма. Функция работает с сообщениями, содержащими 2 текстовые части (text/plain и text/html) и
+ возвращает true тогда, когда эти части различаются более чем на n процентов. Если аргумент не указан, то
+ по умолчанию ищется различие в 100% (полностью разные части).
diff --git a/perl/ b/perl/
index 0d70d4270..10ca63e16 100644
--- a/perl/
+++ b/perl/
@@ -1,7 +1,7 @@
use ExtUtils::MakeMaker;
NAME => 'Mail::Rspamd',
- AUTHOR => 'Vsevolod Stakhov <>',
+ AUTHOR => 'Vsevolod Stakhov <>',
XS => { 'Rspamd.xs' => 'Rspamd.c' },
VERSION_FROM => '', # finds $VERSION
LIBS => ['${GLIB_LDFLAGS} ${GMIME_LDFLAGS} -levent'], # e.g., '-lm'
@@ -15,6 +15,7 @@ WriteMakefile(
'Rspamd.c' => qw{
Rspamd/ContentType.xs Rspamd/Part.xs Rspamd/Hash.xs
Rspamd/InternetAddress.xs Rspamd/Message.xs Rspamd/Object.xs
diff --git a/perl/Rspamd.pod b/perl/Rspamd.pod
index d40574820..2af9b4965 100644
--- a/perl/Rspamd.pod
+++ b/perl/Rspamd.pod
@@ -197,7 +197,7 @@ E.g.:
Mail::Rspamd::Header is a private structure. This structure contains
all the headers except special ones (Content-* MIME-Version).
-Look for L<Header tied hash> for easy maintaining for header.
+Look for Header tied hash for easy maintaining for header.
Use also the Mail::Rspamd::Message::get_header() and set_header() methods.
@@ -476,6 +476,10 @@ Return Mail::Rspamd::Config object.
Return message's urls as array of strings.
+=item I<get_text_parts> ()
+Return message's text parts as array of Mail::Rspamd::TextPart objects.
=head2 Mail::Rspamd::Config
@@ -493,15 +497,23 @@ Gets and sets specified parameter in config.
=item I<get_metric> (metric)
Returns hash of parameters of specified metric:
+=begin text
'name' => name of metric
'func_name' => consolidation function
'required_score' => score for metric
+=end text
=item I<get_statfile> (statfile)
Returns parameters of specified statfile:
+=begin text
'alias' => alias of statfile
'pattern' => fs pattern
@@ -510,12 +522,38 @@ Returns parameters of specified statfile:
'size' => size of statfile
+=end text
=item I<get_module_param> (modulename, paramname)
Return parameter's value for specified module.
+=head2 Mail::Rspamd::TextPart
+Object that represents a single text part of a message.
+=over 4
+=item I<get_content> ()
+Returns content of part.
+=item I<get_fuzzy> ()
+Returns fuzzy hash of part as string.
+=item I<compare_distance> (other)
+Calculate distance between two parts using their fuzzy hashes. Return value from 0 (identical) to 100 (totally different).
+=item I<is_html> ()
+Return 0 if part is plain text and not 0 otherwise.
diff --git a/perl/Rspamd.xs b/perl/Rspamd.xs
index 9231d3e62..892e4b006 100644
--- a/perl/Rspamd.xs
+++ b/perl/Rspamd.xs
@@ -14,6 +14,7 @@
#include "../src/cfg_file.h"
#include "../src/perl.h"
#include "../src/mem_pool.h"
+#include "../src/fuzzy.h"
CvXSUBANY(cv).any_dptr = (void (*) (pTHX_ void*))(CAT2( g_mime_message_,f ))
@@ -47,6 +48,7 @@ typedef GMimePartEncodingType Mail__Rspamd__PartEncodingType;
typedef GMimeObject * Mail__Rspamd__Object;
typedef GMimeParam * Mail__Rspamd__Param;
typedef GMimePart * Mail__Rspamd__Part;
+typedef struct mime_text_part * Mail__Rspamd__TextPart;
typedef GMimeParser * Mail__Rspamd__Parser;
typedef GMimeMultipart * Mail__Rspamd__MultiPart;
typedef GMimeMessage * Mail__Rspamd__Message;
@@ -401,5 +403,6 @@ INCLUDE: Rspamd/Message.xs
INCLUDE: Rspamd/InternetAddress.xs
INCLUDE: Rspamd/Hash.xs
+INCLUDE: Rspamd/TextPart.xs
diff --git a/perl/Rspamd/Task.xs b/perl/Rspamd/Task.xs
index 16719ef7b..31928bf7b 100644
--- a/perl/Rspamd/Task.xs
+++ b/perl/Rspamd/Task.xs
@@ -81,3 +81,24 @@ rspamd_task_get_urls (task)
+rspamd_task_get_text_parts (task)
+ Mail::Rspamd::Task task
+ AV* retav;
+ GList *cur;
+ SV* ps;
+ retav = newAV ();
+ /* Walk task->text_parts and push one blessed Mail::Rspamd::TextPart reference per node */
+ cur = g_list_first (task->text_parts);
+ while (cur) {
+ ps = newSViv (0);
+ sv_setref_pv (ps, "Mail::Rspamd::TextPart", (Mail__Rspamd__TextPart)(cur->data));
+ av_push(retav, ps);
+ /* Advance from the current node: g_list_next (task->text_parts) always yields the second node and never terminates for lists of two or more parts */
+ cur = g_list_next (cur);
+ }
+ RETVAL = retav;
diff --git a/perl/Rspamd/TextPart.xs b/perl/Rspamd/TextPart.xs
new file mode 100644
index 000000000..485ee7bcc
--- /dev/null
+++ b/perl/Rspamd/TextPart.xs
@@ -0,0 +1,40 @@
+MODULE = Mail::Rspamd PACKAGE = Mail::Rspamd::TextPart PREFIX = rspamd_text_part_
+SV *
+rspamd_text_part_get_content (mime_part)
+ Mail::Rspamd::TextPart mime_part
+ SV* content;
+ ST(0) = &PL_sv_undef;
+ /* Build a mortal string SV that exposes the content byte array of the part */
+ content = sv_newmortal ();
+ SvUPGRADE (content, SVt_PV);
+ /* Mark the SV read-only and point its string buffer directly at mime_part->content->data (no copy is made here) */
+ SvREADONLY_on (content);
+ SvPVX(content) = (char *) (mime_part->content->data);
+ SvCUR_set (content, mime_part->content->len);
+ /* NOTE(review): an allocated length of 0 presumably tells perl it does not own the buffer and must not free it - confirm against perlguts */
+ SvLEN_set (content, 0);
+ SvPOK_only (content);
+ ST(0) = content;
+char *
+rspamd_text_part_get_fuzzy (mime_part)
+ Mail::Rspamd::TextPart mime_part
+ RETVAL = mime_part->fuzzy->hash_pipe; /* hash_pipe buffer of the part's fuzzy hash, returned as a C string; NOTE(review): relies on hash_pipe being NUL-terminated - confirm */
+rspamd_text_part_compare_distance (mime_part, other)
+ Mail::Rspamd::TextPart mime_part
+ Mail::Rspamd::TextPart other
+ RETVAL = fuzzy_compare_hashes (mime_part->fuzzy, other->fuzzy); /* result of fuzzy_compare_hashes on the two parts' fuzzy hashes, passed through unchanged */
+rspamd_text_part_is_html (mime_part)
+ Mail::Rspamd::TextPart mime_part
+ RETVAL = mime_part->is_html; /* value of the part's is_html field, returned as stored */
diff --git a/perl/typemap b/perl/typemap
index fa2dabb43..0ae0ef7de 100644
--- a/perl/typemap
+++ b/perl/typemap
@@ -20,6 +20,7 @@ Mail::Rspamd::Object T_PTROBJ
Mail::Rspamd::Param T_PTROBJ
Mail::Rspamd::Message T_PTROBJ
Mail::Rspamd::Part T_PTROBJ
+Mail::Rspamd::TextPart T_PTROBJ
Mail::Rspamd::ContentType T_PTROBJ
Mail::Rspamd::InternetAddress T_PTROBJ
Mail::Rspamd::Hash::Header T_PTROBJ
diff --git a/src/cfg_utils.c b/src/cfg_utils.c
index 1eeb518ed..037f23754 100644
--- a/src/cfg_utils.c
+++ b/src/cfg_utils.c
@@ -355,6 +355,11 @@ substitute_variable (struct config_file *cfg, char *str, u_char recursive)
char *var, *new, *v_begin, *v_end;
size_t len;
+ if (str == NULL) {
+ yywarn ("substitute_variable: trying to substitute variable in NULL string");
+ return NULL;
+ }
while ((v_begin = strstr (str, "${")) != NULL) {
len = strlen (str);
*v_begin = '\0';
diff --git a/src/expressions.c b/src/expressions.c
index 5cb30e4c3..eefd11f78 100644
--- a/src/expressions.c
+++ b/src/expressions.c
@@ -26,12 +26,15 @@
#include "util.h"
#include "cfg_file.h"
#include "main.h"
+#include "message.h"
+#include "fuzzy.h"
#include "expressions.h"
typedef gboolean (*rspamd_internal_func_t)(struct worker_task *, GList *args);
gboolean rspamd_compare_encoding (struct worker_task *task, GList *args);
gboolean rspamd_header_exists (struct worker_task *task, GList *args);
+gboolean rspamd_parts_distance (struct worker_task *task, GList *args);
* List of internal functions of rspamd
* Sorted by name to use bsearch
@@ -41,6 +44,7 @@ static struct _fl {
rspamd_internal_func_t func;
} rspamd_functions_list[] = {
{ "compare_encoding", rspamd_compare_encoding },
+ { "compare_parts_distance", rspamd_parts_distance },
{ "header_exists", rspamd_header_exists },
@@ -273,7 +277,9 @@ parse_expression (memory_pool_t *pool, char *line)
if (*p == '/' && *(p - 1) != '\\') {
- p ++;
+ if (*(p + 1)) {
+ p ++;
+ }
else {
@@ -285,14 +291,17 @@ parse_expression (memory_pool_t *pool, char *line)
if (!is_regexp_flag (*p) || *(p + 1) == '\0') {
if (c != p) {
/* Copy operand */
- str = memory_pool_alloc (pool, p - c + 3);
- g_strlcpy (str, c - 1, (p - c + 3));
+ if (*(p + 1) == '\0') {
+ p++;
+ }
+ str = memory_pool_alloc (pool, p - c + 2);
+ g_strlcpy (str, c - 1, (p - c + 2));
g_strstrip (str);
if (strlen (str) > 0) {
insert_expression (pool, &expr, EXPR_REGEXP, 0, str);
- c = ++p;
+ c = p;
state = SKIP_SPACES;
else {
@@ -594,5 +603,52 @@ rspamd_header_exists (struct worker_task *task, GList *args)
+ * This function is designed to find difference between text/html and text/plain parts
+ * It takes one argument: difference threshold, if we have two text parts, compare
+ * its hashes and check for threshold, if value is greater than threshold, return TRUE
+ * and return FALSE otherwise.
+ */
+rspamd_parts_distance (struct worker_task *task, GList *args)
+ int threshold;
+ unsigned long parsed;
+ char *end;
+ struct mime_text_part *p1, *p2;
+ GList *cur;
+ if (args == NULL) {
+ /* The threshold argument is optional: without it 100 is used */
+ msg_debug ("rspamd_parts_distance: no threshold is specified, assume it 100");
+ threshold = 100;
+ }
+ else {
+ errno = 0;
+ end = NULL;
+ parsed = strtoul ((char *)args->data, &end, 10);
+ /*
+ * strtoul reports only overflow through errno; text without digits parses as 0,
+ * a threshold that any non-negative distance satisfies. Treat such input, and
+ * values outside the documented 0..100 range (including negative numbers, which
+ * strtoul wraps to huge values), as bad and fall back to 100.
+ */
+ if (errno != 0 || end == (char *)args->data || parsed > 100) {
+ msg_info ("rspamd_parts_distance: bad numeric value for threshold \"%s\", assume it 100", (char *)args->data);
+ threshold = 100;
+ }
+ else {
+ threshold = (int)parsed;
+ }
+ }
+ if (g_list_length (task->text_parts) == 2) {
+ cur = g_list_first (task->text_parts);
+ p1 = cur->data;
+ cur = g_list_next (cur);
+ if (cur == NULL) {
+ msg_info ("rspamd_parts_distance: bad parts list");
+ return FALSE;
+ }
+ p2 = cur->data;
+ /* TRUE when the value reported by fuzzy_compare_hashes reaches the threshold */
+ if (fuzzy_compare_hashes (p1->fuzzy, p2->fuzzy) >= threshold) {
+ return TRUE;
+ }
+ }
+ else {
+ /* Every count other than two lands here, including messages with fewer than two text parts */
+ msg_debug ("rspamd_parts_distance: message does not have exactly two text parts, so do not try to compare them with each other");
+ return FALSE;
+ }
+ return FALSE;
* vi:ts=4
diff --git a/src/filter.c b/src/filter.c
index 766cd16e4..1b6cdc1b0 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -437,10 +437,10 @@ statfiles_callback (gpointer key, gpointer value, void *arg)
struct classifier *classifier;
struct statfile_result_data *res_data;
struct metric *metric;
+ struct mime_text_part *text_part;
GTree *tokens = NULL;
- GList *cur = NULL;
- GByteArray *content;
+ GList *cur;
char *filename;
f_str_t c;
@@ -457,10 +457,12 @@ statfiles_callback (gpointer key, gpointer value, void *arg)
+ cur = g_list_first (task->text_parts);
if ((tokens = g_hash_table_lookup (data->tokens, st->tokenizer)) == NULL) {
- while ((content = get_next_text_part (task->task_pool, task->parts, &cur)) != NULL) {
- c.begin = content->data;
- c.len = content->len;
+ while (cur != NULL) {
+ text_part = (struct mime_text_part *)cur->data;
+ c.begin = text_part->content->data;
+ c.len = text_part->content->len;
/* Tree would be freed at task pool freeing */
if (!st->tokenizer->tokenize_func (st->tokenizer, task->task_pool, &c, &tokens)) {
msg_info ("statfiles_callback: cannot tokenize input");
diff --git a/src/fuzzy.c b/src/fuzzy.c
index 08814eaa1..4dfec3fb7 100644
--- a/src/fuzzy.c
+++ b/src/fuzzy.c
@@ -95,7 +95,7 @@ fuzzy_update (fuzzy_hash_t *h, char c)
if (h->rh % h->block_size == (h->block_size - 1)) {
h->hash_pipe[h->hi] = h->h;
- if (h->hi < FUZZY_HASHLEN - 1) {
+ if (h->hi < FUZZY_HASHLEN - 2) {
h->h = HASH_INIT;
h->hi ++;
@@ -249,6 +249,17 @@ fuzzy_init (f_str_t *in, memory_pool_t *pool)
return new;
+/* Convenience wrapper: describes the bytes of a GByteArray (data pointer and length) with an f_str_t and returns the result of fuzzy_init for it and the given pool */
+fuzzy_hash_t *
+fuzzy_init_byte_array (GByteArray *in, memory_pool_t *pool)
+ f_str_t f;
+ f.begin = in->data;
+ f.len = in->len;
+ return fuzzy_init (&f, pool);
/* Compare score of difference between two hashes 0 - different hashes, 100 - identical hashes */
fuzzy_compare_hashes (fuzzy_hash_t *h1, fuzzy_hash_t *h2)
diff --git a/src/fuzzy.h b/src/fuzzy.h
index 91e6512c6..50d1a9110 100644
--- a/src/fuzzy.h
+++ b/src/fuzzy.h
@@ -27,6 +27,7 @@ typedef struct fuzzy_hash_s {
* @return fuzzy_hash object allocated in pool
fuzzy_hash_t * fuzzy_init (f_str_t *in, memory_pool_t *pool);
+fuzzy_hash_t * fuzzy_init_byte_array (GByteArray *in, memory_pool_t *pool);
* Compare score of difference between two hashes
diff --git a/src/main.h b/src/main.h
index 28eb64297..e26ab3fda 100644
--- a/src/main.h
+++ b/src/main.h
@@ -174,6 +174,7 @@ struct worker_task {
int parts_count; /**< mime parts count */
GMimeMessage *message; /**< message, parsed with GMime */
GList *parts; /**< list of parsed parts */
+ GList *text_parts; /**< list of text parts */
char *raw_headers; /**< list of raw headers */
TAILQ_HEAD (uriq, uri) urls; /**< list of parsed urls */
GHashTable *results; /**< hash table of metric_result indexed by
diff --git a/src/message.c b/src/message.c
index 76743f7de..807463a82 100644
--- a/src/message.c
+++ b/src/message.c
@@ -242,6 +242,7 @@ mime_foreach_callback (GMimeObject *part, gpointer user_data)
struct worker_task *task = (struct worker_task *)user_data;
struct mime_part *mime_part;
+ struct mime_text_part *text_part;
GMimeContentType *type;
GMimeDataWrapper *wrapper;
GMimeStream *part_stream;
@@ -302,13 +303,27 @@ mime_foreach_callback (GMimeObject *part, gpointer user_data)
mime_part->content = part_content;
msg_debug ("mime_foreach_callback: found part with content-type: %s/%s", type->type, type->subtype);
task->parts = g_list_prepend (task->parts, mime_part);
- if (g_mime_content_type_is_type (type, "text", "html")) {
+ /* Now do special processing for text parts of message */
+ if (g_mime_content_type_is_type (type, "text", "html") || g_mime_content_type_is_type (type, "text", "xhtml")) {
msg_debug ("mime_foreach_callback: got urls from text/html part");
url_parse_html (task, part_content);
+ text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
+ text_part->content = strip_html_tags (part_content, NULL);
+ text_part->is_html = TRUE;
+ text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
+ memory_pool_add_destructor (task->task_pool, (pool_destruct_func)free_byte_array_callback, text_part->content);
+ task->text_parts = g_list_prepend (task->text_parts, text_part);
else if (g_mime_content_type_is_type (type, "text", "plain")) {
- url_parse_text (task, part_content);
msg_debug ("mime_foreach_callback: got urls from text/plain part");
+ url_parse_text (task, part_content);
+ text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
+ text_part->content = part_content;
+ text_part->is_html = FALSE;
+ text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
+ task->text_parts = g_list_prepend (task->text_parts, text_part);
else {
diff --git a/src/message.h b/src/message.h
index 1122e7e3b..eaf9f493e 100644
--- a/src/message.h
+++ b/src/message.h
@@ -7,11 +7,17 @@
#include "config.h"
+#include "fuzzy.h"
struct mime_part {
GMimeContentType *type;
GByteArray *content;
- TAILQ_ENTRY (mime_part) next;
+struct mime_text_part {
+ gboolean is_html;
+ GByteArray *content;
+ fuzzy_hash_t *fuzzy;
diff --git a/test/rspamd_fuzzy_test.c b/test/rspamd_fuzzy_test.c
index d737a9171..9feeb4500 100644
--- a/test/rspamd_fuzzy_test.c
+++ b/test/rspamd_fuzzy_test.c
@@ -21,24 +21,56 @@ static char *s2 = "This is sample test text.\r\n"
+static char *s3 = "";
+static char *s4 = "abcdefghijklmn\r\n";
+static char *s5 = "This is sample test text.\r\n"
+ "abcdefghijklmnopqrstuvwx.\r\n"
+ "abcdefghijklmnopzrstuvwx.\r\n"
+ "abcdefghijklmnopqrstuvwx.\r\n"
+ "abcdefghijklmnopqrstuvwx.\r\n"
+ "abcdefghijklmnopqrstuvwx.\r\n"
+ "abcdefghijklmnopqrstuvwx.\r\n"
+ "abcdefghijklmnopqrstuvwx.\r\n"
+ "abcdefghijklmnopqrstuvwx.\r\n";
rspamd_fuzzy_test_func ()
memory_pool_t *pool;
- fuzzy_hash_t *h1, *h2;
- f_str_t f1, f2;
+ fuzzy_hash_t *h1, *h2, *h3, *h4, *h5;
+ f_str_t f1, f2, f3, f4, f5;
+ int diff1, diff2;
pool = memory_pool_new (1024);
f1.begin = s1;
f1.len = strlen (s1);
f2.begin = s2;
f2.len = strlen (s2);
+ f3.begin = s3;
+ f3.len = strlen (s3);
+ f4.begin = s4;
+ f4.len = strlen (s4);
+ f5.begin = s5;
+ f5.len = strlen (s5);
h1 = fuzzy_init (&f1, pool);
h2 = fuzzy_init (&f2, pool);
+ h3 = fuzzy_init (&f3, pool);
+ h4 = fuzzy_init (&f4, pool);
+ h5 = fuzzy_init (&f5, pool);
- msg_info ("rspamd_fuzzy_test_func: difference between strings is %d", fuzzy_compare_hashes (h1, h2));
+ diff1 = fuzzy_compare_hashes (h3, h4) + fuzzy_compare_hashes (h2, h4);
+ diff2 = fuzzy_compare_hashes (h2, h5);
+ msg_debug ("rspamd_fuzzy_test_func: s1, s2 difference between strings is %d", fuzzy_compare_hashes (h1, h2));
+ msg_debug ("rspamd_fuzzy_test_func: s1, s3 difference between strings is %d", fuzzy_compare_hashes (h1, h3));
+ msg_debug ("rspamd_fuzzy_test_func: s3, s4 difference between strings is %d", fuzzy_compare_hashes (h3, h4));
+ msg_debug ("rspamd_fuzzy_test_func: s2, s4 difference between strings is %d", fuzzy_compare_hashes (h2, h4));
+ msg_debug ("rspamd_fuzzy_test_func: s2, s5 difference between strings is %d", diff2);
+ /* Identical strings */
+ g_assert (diff2 == 0);
+ /* Totally different strings */
+ g_assert (diff1 == 200);
memory_pool_delete (pool);