* Fix error in expression parser that causes bad errors with expressions that have regexp at the end

* Improve test for fuzzy hashes * Add new object - TextPart to perl XS library that allows access to stripped parts and fuzzy hashes * Add documentation for expressions parser and for Mail::Rspamd::TextPart * Always calculate fuzzy hash for text parts * Store text parts separately from other parts * Add compare_parts_distance for expressions that calculates the difference between the parts of two-part messages * Do not try to substitute variables in empty strings
author: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-03-23 14:10:07 +0300
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-03-23 14:10:07 +0300
commit: c79b5ccd22cbc1c273479f4f88189a18effda533 (patch)
tree: 1741743779a70146a61cd1767936aa43d671e36b /src
parent: afdaddc4d0745a5bcefad73dd74fd4c03ae3de15 (diff)
download: rspamd-c79b5ccd22cbc1c273479f4f88189a18effda533.tar.gz
rspamd-c79b5ccd22cbc1c273479f4f88189a18effda533.zip
8 files changed, 110 insertions, 13 deletions
diff --git a/src/cfg_utils.c b/src/cfg_utils.c
index 1eeb518ed..037f23754 100644
--- a/src/cfg_utils.c
+++ b/src/cfg_utils.c
@@ -355,6 +355,11 @@ substitute_variable (struct config_file *cfg, char *str, u_char recursive)
 	char *var, *new, *v_begin, *v_end;
 	size_t len;
 
+	if (str == NULL) {
+		yywarn ("substitute_variable: trying to substitute variable in NULL string");
+		return NULL;
+	}
+
 	while ((v_begin = strstr (str, "${")) != NULL) {
 		len = strlen (str);
 		*v_begin = '\0';
diff --git a/src/expressions.c b/src/expressions.c
index 5cb30e4c3..eefd11f78 100644
--- a/src/expressions.c
+++ b/src/expressions.c
@@ -26,12 +26,15 @@
 #include "util.h"
 #include "cfg_file.h"
 #include "main.h"
+#include "message.h"
+#include "fuzzy.h"
 #include "expressions.h"
 
 typedef gboolean (*rspamd_internal_func_t)(struct worker_task *, GList *args);
 
 gboolean rspamd_compare_encoding (struct worker_task *task, GList *args);
 gboolean rspamd_header_exists (struct worker_task *task, GList *args);
+gboolean rspamd_parts_distance (struct worker_task *task, GList *args);
 /*
  * List of internal functions of rspamd
  * Sorted by name to use bsearch
@@ -41,6 +44,7 @@ static struct _fl {
 	rspamd_internal_func_t func;
 } rspamd_functions_list[] = {
 	{ "compare_encoding", rspamd_compare_encoding },
+	{ "compare_parts_distance", rspamd_parts_distance },
 	{ "header_exists", rspamd_header_exists },
 };
 
@@ -273,7 +277,9 @@ parse_expression (memory_pool_t *pool, char *line)
 
 			case READ_REGEXP:
 				if (*p == '/' && *(p - 1) != '\\') {
-					p ++;
+					if (*(p + 1)) {
+						p ++;
+					}
 					state = READ_REGEXP_FLAGS;
 				}
 				else {
@@ -285,14 +291,17 @@ parse_expression (memory_pool_t *pool, char *line)
 				if (!is_regexp_flag (*p) || *(p + 1) == '\0') {
 					if (c != p) {
 						/* Copy operand */
-						str = memory_pool_alloc (pool, p - c + 3);
-						g_strlcpy (str, c - 1, (p - c + 3));
+						if (*(p + 1) == '\0') {
+							p++;
+						}
+						str = memory_pool_alloc (pool, p - c + 2);
+						g_strlcpy (str, c - 1, (p - c + 2));
 						g_strstrip (str);
 						if (strlen (str) > 0) {
 							insert_expression (pool, &expr, EXPR_REGEXP, 0, str);
 						}
 					}
-					c = ++p;
+					c = p;
 					state = SKIP_SPACES;
 				}
 				else {
@@ -594,5 +603,52 @@ rspamd_header_exists (struct worker_task *task, GList *args)
 }
 
 /*
+ * This function is designed to find the difference between text/html and text/plain parts.
+ * It takes one argument: the difference threshold. If we have exactly two text parts, compare
+ * their hashes against the threshold; if the value is greater than or equal to it, return TRUE,
+ * and return FALSE otherwise.
+ */
+gboolean 
+rspamd_parts_distance (struct worker_task *task, GList *args)	/* expression function registered as "compare_parts_distance" */
+{	
+	int threshold;	/* minimum fuzzy_compare_hashes () score that yields TRUE; falls back to 100 */
+	struct mime_text_part *p1, *p2;	/* first and second entries of task->text_parts */
+	GList *cur;
+	
+	if (args == NULL) {	/* no argument supplied: use the default threshold */
+		msg_debug ("rspamd_parts_distance: no threshold is specified, assume it 100");
+		threshold = 100;
+	}
+	else {
+		errno = 0;	/* cleared so that a failure reported by strtoul can be detected below */
+		threshold = strtoul ((char *)args->data, NULL, 10);	/* first argument read as base-10; NOTE(review): unsigned long result is narrowed to int */
+		if (errno != 0) {	/* NOTE(review): no end pointer is checked, so non-numeric text presumably parses as 0 without reaching this branch - confirm */
+			msg_info ("rspamd_parts_distance: bad numeric value for threshold \"%s\", assume it 100", (char *)args->data);
+			threshold = 100;
+		}
+	}
+
+	if (g_list_length (task->text_parts) == 2) {	/* the comparison is attempted only for exactly two text parts */
+		cur = g_list_first (task->text_parts);
+		p1 = cur->data;
+		cur = g_list_next (cur);
+		if (cur == NULL) {	/* defensive: second node missing although the length was reported as 2 */
+			msg_info ("rspamd_parts_distance: bad parts list");
+			return FALSE;
+		}
+		p2 = cur->data;
+		if (fuzzy_compare_hashes (p1->fuzzy, p2->fuzzy) >= threshold) {	/* score at or above the threshold => TRUE; fuzzy.c documents 0 as different and 100 as identical */
+			return TRUE;
+		}
+	}
+	else {	/* NOTE(review): any count other than 2 lands here, including 0 or 1 parts, despite the "too many" wording of the log message */
+		msg_debug ("rspamd_parts_distance: message has too many text parts, so do not try to compare them with each other");
+		return FALSE;
+	}
+
+	return FALSE;	/* exactly two parts, but their score is below the threshold */
+}
+
+/*
  * vi:ts=4
  */
diff --git a/src/filter.c b/src/filter.c
index 766cd16e4..1b6cdc1b0 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -437,10 +437,10 @@ statfiles_callback (gpointer key, gpointer value, void *arg)
 	struct classifier *classifier;
 	struct statfile_result_data *res_data;
 	struct metric *metric;
+	struct mime_text_part *text_part;
 
 	GTree *tokens = NULL;
-	GList *cur = NULL;
-	GByteArray *content;
+	GList *cur;
 
 	char *filename;
 	f_str_t c;
@@ -457,10 +457,12 @@ statfiles_callback (gpointer key, gpointer value, void *arg)
 		return;
 	}
 	
+	cur = g_list_first (task->text_parts);
 	if ((tokens = g_hash_table_lookup (data->tokens, st->tokenizer)) == NULL) {
-		while ((content = get_next_text_part (task->task_pool, task->parts, &cur)) != NULL) {
-			c.begin = content->data;
-			c.len = content->len;
+		while (cur != NULL) {
+			text_part = (struct mime_text_part *)cur->data;
+			c.begin = text_part->content->data;
+			c.len = text_part->content->len;
 			/* Tree would be freed at task pool freeing */
 			if (!st->tokenizer->tokenize_func (st->tokenizer, task->task_pool, &c, &tokens)) {
 				msg_info ("statfiles_callback: cannot tokenize input");
diff --git a/src/fuzzy.c b/src/fuzzy.c
index 08814eaa1..4dfec3fb7 100644
--- a/src/fuzzy.c
+++ b/src/fuzzy.c
@@ -95,7 +95,7 @@ fuzzy_update (fuzzy_hash_t *h, char c)
     
 	if (h->rh % h->block_size == (h->block_size - 1)) {
 		h->hash_pipe[h->hi] = h->h;
-		if (h->hi < FUZZY_HASHLEN - 1) {
+		if (h->hi < FUZZY_HASHLEN - 2) {
 			h->h = HASH_INIT;
 			h->hi ++;
 		}
@@ -249,6 +249,17 @@ fuzzy_init (f_str_t *in, memory_pool_t *pool)
 	return new;
 }
 
+fuzzy_hash_t *
+fuzzy_init_byte_array (GByteArray *in, memory_pool_t *pool)	/* convenience wrapper: runs fuzzy_init () over the bytes of a GByteArray */
+{
+	f_str_t f;	/* temporary descriptor pointing at in's buffer; this function copies no bytes itself */
+
+	f.begin = in->data;	/* in is dereferenced without a NULL check, so callers must pass a valid array */
+	f.len = in->len;
+
+	return fuzzy_init (&f, pool);	/* returns whatever fuzzy_init () produces for this buffer and pool */
+}
+
 /* Compare score of difference between two hashes 0 - different hashes, 100 - identical hashes */
 int
 fuzzy_compare_hashes (fuzzy_hash_t *h1, fuzzy_hash_t *h2) 
diff --git a/src/fuzzy.h b/src/fuzzy.h
index 91e6512c6..50d1a9110 100644
--- a/src/fuzzy.h
+++ b/src/fuzzy.h
@@ -27,6 +27,7 @@ typedef struct fuzzy_hash_s {
  * @return fuzzy_hash object allocated in pool
  */
 fuzzy_hash_t * fuzzy_init (f_str_t *in, memory_pool_t *pool);
+fuzzy_hash_t * fuzzy_init_byte_array (GByteArray *in, memory_pool_t *pool);
 
 /**
  * Compare score of difference between two hashes 
diff --git a/src/main.h b/src/main.h
index 28eb64297..e26ab3fda 100644
--- a/src/main.h
+++ b/src/main.h
@@ -174,6 +174,7 @@ struct worker_task {
 	int parts_count;											/**< mime parts count								*/
 	GMimeMessage *message;										/**< message, parsed with GMime						*/
 	GList *parts;												/**< list of parsed parts							*/
+	GList *text_parts;											/**< list of text parts								*/
 	char *raw_headers;											/**< list of raw headers							*/
 	TAILQ_HEAD (uriq, uri) urls;								/**< list of parsed urls							*/
 	GHashTable *results;										/**< hash table of metric_result indexed by 
diff --git a/src/message.c b/src/message.c
index 76743f7de..807463a82 100644
--- a/src/message.c
+++ b/src/message.c
@@ -242,6 +242,7 @@ mime_foreach_callback (GMimeObject *part, gpointer user_data)
 {
 	struct worker_task *task = (struct worker_task *)user_data;
 	struct mime_part *mime_part;
+	struct mime_text_part *text_part;
 	GMimeContentType *type;
 	GMimeDataWrapper *wrapper;
 	GMimeStream *part_stream;
@@ -302,13 +303,27 @@ mime_foreach_callback (GMimeObject *part, gpointer user_data)
 				mime_part->content = part_content;
 				msg_debug ("mime_foreach_callback: found part with content-type: %s/%s", type->type, type->subtype);
 				task->parts = g_list_prepend (task->parts, mime_part);
-				if (g_mime_content_type_is_type (type, "text", "html")) {
+				/* Now do special processing for text parts of message */
+				if (g_mime_content_type_is_type (type, "text", "html") || g_mime_content_type_is_type (type, "text", "xhtml")) {
 					msg_debug ("mime_foreach_callback: got urls from text/html part");
 					url_parse_html (task, part_content);
+
+					text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
+					text_part->content = strip_html_tags (part_content, NULL);
+					text_part->is_html = TRUE;
+					text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
+					memory_pool_add_destructor (task->task_pool, (pool_destruct_func)free_byte_array_callback, text_part->content);
+					task->text_parts = g_list_prepend (task->text_parts, text_part);
 				} 
 				else if (g_mime_content_type_is_type (type, "text", "plain")) {
-					url_parse_text (task, part_content);
 					msg_debug ("mime_foreach_callback: got urls from text/plain part");
+					url_parse_text (task, part_content);
+
+					text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
+					text_part->content = part_content;
+					text_part->is_html = FALSE;
+					text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
+					task->text_parts = g_list_prepend (task->text_parts, text_part);
 				}
 			}
 			else {
diff --git a/src/message.h b/src/message.h
index 1122e7e3b..eaf9f493e 100644
--- a/src/message.h
+++ b/src/message.h
@@ -7,11 +7,17 @@
 #define RSPAMD_MESSAGE_H
 
 #include "config.h"
+#include "fuzzy.h"
 
 struct mime_part {
 	GMimeContentType *type;
 	GByteArray *content;
-	TAILQ_ENTRY (mime_part) next;
+};
+
+struct mime_text_part {	/* one text part of a message; mime_foreach_callback prepends these to task->text_parts */
+	gboolean is_html;	/* TRUE for text/html and text/xhtml parts, FALSE for text/plain parts */
+	GByteArray *content;	/* part body; for HTML parts this is the result of strip_html_tags () */
+	fuzzy_hash_t *fuzzy;	/* hash built by fuzzy_init_byte_array () from content */
+};
 
 /**
author	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2009-03-23 14:10:07 +0300
committer	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2009-03-23 14:10:07 +0300
commit	c79b5ccd22cbc1c273479f4f88189a18effda533 (patch)
tree	1741743779a70146a61cd1767936aa43d671e36b /src
parent	afdaddc4d0745a5bcefad73dd74fd4c03ae3de15 (diff)
download	rspamd-c79b5ccd22cbc1c273479f4f88189a18effda533.tar.gz rspamd-c79b5ccd22cbc1c273479f4f88189a18effda533.zip