From: Vsevolod Stakhov <vsevolod@rambler-co.ru>
Date: Wed, 15 Apr 2009 13:01:01 +0000 (+0400)
Subject: * For mime parts set flag 'raw' and if we cannot determine charset of part or cannot
X-Git-Tag: 0.2.7~195
X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=d50dff03fdf56db4a24cf44e4e9eec70c69e81c3;p=rspamd.git

* Add a 'raw' flag to mime parts; if we cannot determine the charset of a part or
  cannot convert it to UTF-8, just use raw regexps for such parts
---

diff --git a/src/cfg_file.h b/src/cfg_file.h
index b1cbd6125..c91b419ba 100644
--- a/src/cfg_file.h
+++ b/src/cfg_file.h
@@ -77,6 +77,7 @@ struct rspamd_regexp {
 	enum rspamd_regexp_type type;					/**< regexp type										*/
 	char *regexp_text;								/**< regexp text representation							*/
 	GRegex *regexp;									/**< glib regexp structure								*/
+	GRegex *raw_regexp;								/**< glib regexp structure for raw matching				*/
 	char *header;									/**< header name for header regexps						*/
 };
 
diff --git a/src/expressions.c b/src/expressions.c
index 8dfdc17b3..085708342 100644
--- a/src/expressions.c
+++ b/src/expressions.c
@@ -632,12 +632,20 @@ parse_regexp (memory_pool_t *pool, char *line)
 	result->regexp = g_regex_new (begin, regexp_flags, 0, &err);
 	result->regexp_text = memory_pool_strdup (pool, begin);
 	memory_pool_add_destructor (pool, (pool_destruct_func)g_regex_unref, (void *)result->regexp);
-	*end = '/';
 
 	if (result->regexp == NULL || err != NULL) {
+		*end = '/';
 		msg_warn ("parse_regexp: could not read regexp: %s while reading regexp %s", err->message, src);
 		return NULL;
 	}
+	result->raw_regexp = g_regex_new (begin, regexp_flags | G_REGEX_RAW, 0, &err);
+	memory_pool_add_destructor (pool, (pool_destruct_func)g_regex_unref, (void *)result->raw_regexp);
+	*end = '/';
+
+	if (result->raw_regexp == NULL || err != NULL) {
+		msg_warn ("parse_regexp: could not read raw regexp: %s while reading regexp %s", err->message, src);
+		return NULL;
+	}
 	
 	/* Add to cache for further usage */
 	re_cache_add (result->regexp_text, result);
diff --git a/src/message.c b/src/message.c
index 32d9bd673..14f9245cb 100644
--- a/src/message.c
+++ b/src/message.c
@@ -235,7 +235,7 @@ free_byte_array_callback (void *pointer)
 }
 
 static GByteArray *
-convert_text_to_utf (struct worker_task *task, GByteArray *part_content, GMimeContentType *type)
+convert_text_to_utf (struct worker_task *task, GByteArray *part_content, GMimeContentType *type, struct mime_text_part *text_part)
 {
 	GError *err = NULL;
 	gsize read_bytes, write_bytes;
@@ -244,10 +244,12 @@ convert_text_to_utf (struct worker_task *task, GByteArray *part_content, GMimeCo
 	GByteArray *result_array;
 
 	if ((charset = g_mime_content_type_get_parameter (type, "charset")) == NULL) {
-		charset = "ASCII";
+		text_part->is_raw = TRUE;
+		return part_content;
 	}
 	
 	if (g_ascii_strcasecmp (charset, "utf-8") == 0 || g_ascii_strcasecmp (charset, "utf8") == 0) {
+		text_part->is_raw = TRUE;
 		return part_content;
 	}
 	
@@ -256,6 +258,7 @@ convert_text_to_utf (struct worker_task *task, GByteArray *part_content, GMimeCo
 									  &read_bytes, &write_bytes, &err);
 	if (res_str == NULL) {
 		msg_warn ("convert_text_to_utf: cannot convert from %s to utf8: %s", charset, err ? err->message : "unknown problem");
+		text_part->is_raw = TRUE;
 		return part_content;
 	}
 
@@ -263,6 +266,7 @@ convert_text_to_utf (struct worker_task *task, GByteArray *part_content, GMimeCo
 	result_array->data = res_str;
 	result_array->len = write_bytes + 1;
 	memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_free, res_str);
+	text_part->is_raw = FALSE;
 
 	return result_array;
 }
@@ -277,7 +281,7 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
 		url_parse_html (task, part_content);
 
 		text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
-		text_part->orig = convert_text_to_utf (task, part_content, type);
+		text_part->orig = convert_text_to_utf (task, part_content, type, text_part);
 		text_part->content = strip_html_tags (part_content, NULL);
 		text_part->is_html = TRUE;
 		text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
@@ -289,7 +293,7 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
 		url_parse_text (task, part_content);
 
 		text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
-		text_part->orig = convert_text_to_utf (task, part_content, type);
+		text_part->orig = convert_text_to_utf (task, part_content, type, text_part);
 		text_part->content = part_content;
 		text_part->is_html = FALSE;
 		text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
diff --git a/src/message.h b/src/message.h
index c67d14589..9e9b5de1f 100644
--- a/src/message.h
+++ b/src/message.h
@@ -16,6 +16,7 @@ struct mime_part {
 
 struct mime_text_part {
 	gboolean is_html;
+	gboolean is_raw;
 	GByteArray *orig;
 	GByteArray *content;
 	fuzzy_hash_t *fuzzy;
diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c
index a05e0e0e0..fa9eafdd4 100644
--- a/src/plugins/regexp.c
+++ b/src/plugins/regexp.c
@@ -157,6 +157,7 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task)
 	char *headerv, *c, t;
 	struct mime_text_part *part;
 	GList *cur, *headerlist;
+	GRegex *regexp;
 	struct uri *url;
 	int r;
 
@@ -209,7 +210,13 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task)
 			cur = g_list_first (task->text_parts);
 			while (cur) {
 				part = (struct mime_text_part *)cur->data;
-				if (g_regex_match_full (re->regexp, part->orig->data, part->orig->len, 0, 0, NULL, NULL) == TRUE) {
+				if (part->is_raw) {
+					regexp = re->raw_regexp;
+				}
+				else {
+					regexp = re->regexp;
+				}
+				if (g_regex_match_full (regexp, part->orig->data, part->orig->len, 0, 0, NULL, NULL) == TRUE) {
 					task_cache_add (task, re, 1);
 					return 1;
 				}