From: Vsevolod Stakhov Date: Wed, 15 Apr 2009 13:01:01 +0000 (+0400) Subject: * For mime parts set the 'raw' flag, and if we cannot determine the charset of a part or cannot X-Git-Tag: 0.2.7~195 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=d50dff03fdf56db4a24cf44e4e9eec70c69e81c3;p=rspamd.git * For mime parts set the 'raw' flag, and if we cannot determine the charset of a part or cannot convert it to UTF-8, just use raw regexps for such parts --- diff --git a/src/cfg_file.h b/src/cfg_file.h index b1cbd6125..c91b419ba 100644 --- a/src/cfg_file.h +++ b/src/cfg_file.h @@ -77,6 +77,7 @@ struct rspamd_regexp { enum rspamd_regexp_type type; /**< regexp type */ char *regexp_text; /**< regexp text representation */ GRegex *regexp; /**< glib regexp structure */ + GRegex *raw_regexp; /**< glib regexp structure for raw matching */ char *header; /**< header name for header regexps */ }; diff --git a/src/expressions.c b/src/expressions.c index 8dfdc17b3..085708342 100644 --- a/src/expressions.c +++ b/src/expressions.c @@ -632,12 +632,20 @@ parse_regexp (memory_pool_t *pool, char *line) result->regexp = g_regex_new (begin, regexp_flags, 0, &err); result->regexp_text = memory_pool_strdup (pool, begin); memory_pool_add_destructor (pool, (pool_destruct_func)g_regex_unref, (void *)result->regexp); - *end = '/'; if (result->regexp == NULL || err != NULL) { + *end = '/'; msg_warn ("parse_regexp: could not read regexp: %s while reading regexp %s", err->message, src); return NULL; } + result->raw_regexp = g_regex_new (begin, regexp_flags | G_REGEX_RAW, 0, &err); + memory_pool_add_destructor (pool, (pool_destruct_func)g_regex_unref, (void *)result->raw_regexp); + *end = '/'; + + if (result->raw_regexp == NULL || err != NULL) { + msg_warn ("parse_regexp: could not read raw regexp: %s while reading regexp %s", err->message, src); + return NULL; + } /* Add to cache for further usage */ re_cache_add (result->regexp_text, result); diff --git a/src/message.c b/src/message.c index 32d9bd673..14f9245cb
100644 --- a/src/message.c +++ b/src/message.c @@ -235,7 +235,7 @@ free_byte_array_callback (void *pointer) } static GByteArray * -convert_text_to_utf (struct worker_task *task, GByteArray *part_content, GMimeContentType *type) +convert_text_to_utf (struct worker_task *task, GByteArray *part_content, GMimeContentType *type, struct mime_text_part *text_part) { GError *err = NULL; gsize read_bytes, write_bytes; @@ -244,10 +244,12 @@ convert_text_to_utf (struct worker_task *task, GByteArray *part_content, GMimeCo GByteArray *result_array; if ((charset = g_mime_content_type_get_parameter (type, "charset")) == NULL) { - charset = "ASCII"; + text_part->is_raw = TRUE; + return part_content; } if (g_ascii_strcasecmp (charset, "utf-8") == 0 || g_ascii_strcasecmp (charset, "utf8") == 0) { + text_part->is_raw = TRUE; return part_content; } @@ -256,6 +258,7 @@ convert_text_to_utf (struct worker_task *task, GByteArray *part_content, GMimeCo &read_bytes, &write_bytes, &err); if (res_str == NULL) { msg_warn ("convert_text_to_utf: cannot convert from %s to utf8: %s", charset, err ? 
err->message : "unknown problem"); + text_part->is_raw = TRUE; return part_content; } @@ -263,6 +266,7 @@ convert_text_to_utf (struct worker_task *task, GByteArray *part_content, GMimeCo result_array->data = res_str; result_array->len = write_bytes + 1; memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_free, res_str); + text_part->is_raw = FALSE; return result_array; } @@ -277,7 +281,7 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont url_parse_html (task, part_content); text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part)); - text_part->orig = convert_text_to_utf (task, part_content, type); + text_part->orig = convert_text_to_utf (task, part_content, type, text_part); text_part->content = strip_html_tags (part_content, NULL); text_part->is_html = TRUE; text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool); @@ -289,7 +293,7 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont url_parse_text (task, part_content); text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part)); - text_part->orig = convert_text_to_utf (task, part_content, type); + text_part->orig = convert_text_to_utf (task, part_content, type, text_part); text_part->content = part_content; text_part->is_html = FALSE; text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool); diff --git a/src/message.h b/src/message.h index c67d14589..9e9b5de1f 100644 --- a/src/message.h +++ b/src/message.h @@ -16,6 +16,7 @@ struct mime_part { struct mime_text_part { gboolean is_html; + gboolean is_raw; GByteArray *orig; GByteArray *content; fuzzy_hash_t *fuzzy; diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c index a05e0e0e0..fa9eafdd4 100644 --- a/src/plugins/regexp.c +++ b/src/plugins/regexp.c @@ -157,6 +157,7 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task) char *headerv, *c, t; struct mime_text_part *part; GList 
*cur, *headerlist; + GRegex *regexp; struct uri *url; int r; @@ -209,7 +210,13 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task) cur = g_list_first (task->text_parts); while (cur) { part = (struct mime_text_part *)cur->data; - if (g_regex_match_full (re->regexp, part->orig->data, part->orig->len, 0, 0, NULL, NULL) == TRUE) { + if (part->is_raw) { + regexp = re->raw_regexp; + } + else { + regexp = re->regexp; + } + if (g_regex_match_full (regexp, part->orig->data, part->orig->len, 0, 0, NULL, NULL) == TRUE) { task_cache_add (task, re, 1); return 1; }