From: Vsevolod Stakhov
Date: Wed, 15 Apr 2009 12:24:55 +0000 (+0400)
Subject: * Try to convert each text part of messages to utf8 to avoid problems with regexps
X-Git-Tag: 0.2.7~198
X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=6663e20a1a0e14eba4cb31d3e3bb19d989180c54;p=rspamd.git

* Try to convert each text part of messages to utf8 to avoid problems with regexps
---
Review note: the added code below differs from the commit named in X-Git-Url
(the index lines still name the original blobs):
  * main.c: the getrlimit () result is checked, the soft stack limit is
    clamped to the hard limit and a setrlimit () failure is logged.
  * message.c: convert_text_to_utf () releases the GError on failure, no
    longer counts the terminating NUL in the array length and frees the
    converted string with g_free () instead of g_byte_array_free ().
Leading whitespace of context lines was reconstructed; use
"git apply --ignore-whitespace" if it does not match the tree.

diff --git a/src/main.c b/src/main.c
index 348e7a1ea..fcf3f3667 100644
--- a/src/main.c
+++ b/src/main.c
@@ -392,6 +392,7 @@ main (int argc, char **argv, char **env)
 	int res = 0, i, listen_sock, lmtp_listen_sock;
 	struct sigaction signals;
 	struct rspamd_worker *cur, *cur_tmp, *active_worker;
+	struct rlimit rlim;
 	FILE *f;
 	pid_t wrk;
 #ifndef WITHOUT_PERL
@@ -506,6 +507,17 @@ main (int argc, char **argv, char **env)
 
 	/* Drop privilleges */
 	drop_priv (cfg);
+
+	/* Set the soft stack size limit for pcre, never above the hard limit */
+	if (getrlimit (RLIMIT_STACK, &rlim) == 0) {
+		rlim.rlim_cur = 100 * 1024 * 1024;
+		if (rlim.rlim_max != RLIM_INFINITY && rlim.rlim_cur > rlim.rlim_max) {
+			rlim.rlim_cur = rlim.rlim_max;
+		}
+		if (setrlimit (RLIMIT_STACK, &rlim) == -1) {
+			msg_warn ("main: cannot set stack size limit for pcre");
+		}
+	}
 
 	config_logger (rspamd, TRUE);
 
diff --git a/src/message.c b/src/message.c
index 4d127d39d..0d4844141 100644
--- a/src/message.c
+++ b/src/message.c
@@ -234,6 +234,90 @@ free_byte_array_callback (void *pointer)
 	g_byte_array_free (arr, TRUE);
 }
 
+/*
+ * Convert the content of a text part to UTF-8 using the charset declared in
+ * its content type; "ASCII" is assumed when no charset parameter is present.
+ * Returns part_content itself when the part is already UTF-8 or when the
+ * conversion fails, otherwise a new array whose buffer is released together
+ * with the task pool.
+ */
+static GByteArray *
+convert_text_to_utf (struct worker_task *task, GByteArray *part_content, GMimeContentType *type)
+{
+	GError *err = NULL;
+	gsize read_bytes, write_bytes;
+	const char *charset;
+	gchar *res_str;
+	GByteArray *result_array;
+
+	if ((charset = g_mime_content_type_get_parameter (type, "charset")) == NULL) {
+		charset = "ASCII";
+	}
+
+	if (g_ascii_strcasecmp (charset, "utf-8") == 0 || g_ascii_strcasecmp (charset, "utf8") == 0) {
+		return part_content;
+	}
+
+	res_str = g_convert_with_fallback ((const gchar *)part_content->data, part_content->len,
+			"UTF-8", charset, NULL,
+			&read_bytes, &write_bytes, &err);
+	if (res_str == NULL) {
+		msg_warn ("convert_text_to_utf: cannot convert from %s to utf8: %s", charset, err ? err->message : "unknown problem");
+		/* The GError is owned by this function and must be released */
+		g_clear_error (&err);
+		return part_content;
+	}
+
+	/*
+	 * Only the public data and len fields are filled in, so this is not an
+	 * array created by GLib and must not be passed to g_byte_array_free ();
+	 * the converted string is released with g_free () by the pool destructor.
+	 */
+	result_array = memory_pool_alloc (task->task_pool, sizeof (GByteArray));
+	result_array->data = (guint8 *)res_str;
+	/* write_bytes does not count the terminating NUL added by the conversion */
+	result_array->len = write_bytes;
+	memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_free, res_str);
+
+	return result_array;
+}
+
+/*
+ * Add a text/html, text/xhtml or text/plain part to task->text_parts: parse
+ * urls from it, set orig to the result of convert_text_to_utf () and
+ * initialise fuzzy from content (the output of strip_html_tags () for html
+ * parts, part_content itself for plain ones).  Other types are ignored.
+ */
+static void
+process_text_part (struct worker_task *task, GByteArray *part_content, GMimeContentType *type)
+{
+	struct mime_text_part *text_part;
+
+	if (g_mime_content_type_is_type (type, "text", "html") || g_mime_content_type_is_type (type, "text", "xhtml")) {
+		msg_debug ("mime_foreach_callback: got urls from text/html part");
+		url_parse_html (task, part_content);
+
+		text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
+		text_part->orig = convert_text_to_utf (task, part_content, type);
+		text_part->content = strip_html_tags (part_content, NULL);
+		text_part->is_html = TRUE;
+		text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
+		memory_pool_add_destructor (task->task_pool, (pool_destruct_func)free_byte_array_callback, text_part->content);
+		task->text_parts = g_list_prepend (task->text_parts, text_part);
+	}
+	else if (g_mime_content_type_is_type (type, "text", "plain")) {
+		msg_debug ("mime_foreach_callback: got urls from text/plain part");
+		url_parse_text (task, part_content);
+
+		text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
+		text_part->orig = convert_text_to_utf (task, part_content, type);
+		text_part->content = part_content;
+		text_part->is_html = FALSE;
+		text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
+		task->text_parts = g_list_prepend (task->text_parts, text_part);
+	}
+}
+
 #ifdef GMIME24
 static void
 mime_foreach_callback (GMimeObject *parent, GMimeObject *part, gpointer user_data)
@@ -244,7 +328,6 @@ mime_foreach_callback (GMimeObject *part, gpointer user_data)
 {
 	struct worker_task *task = (struct worker_task *)user_data;
 	struct mime_part *mime_part;
-	struct mime_text_part *text_part;
 	GMimeContentType *type;
 	GMimeDataWrapper *wrapper;
 	GMimeStream *part_stream;
@@ -310,30 +393,7 @@ mime_foreach_callback (GMimeObject *part, gpointer user_data)
 			task->parts = g_list_prepend (task->parts, mime_part);
 			/* Skip empty parts */
 			if (part_content->len > 0) {
-				/* Now do special processing for text parts of message */
-				if (g_mime_content_type_is_type (type, "text", "html") || g_mime_content_type_is_type (type, "text", "xhtml")) {
-					msg_debug ("mime_foreach_callback: got urls from text/html part");
-					url_parse_html (task, part_content);
-
-					text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
-					text_part->orig = part_content;
-					text_part->content = strip_html_tags (part_content, NULL);
-					text_part->is_html = TRUE;
-					text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
-					memory_pool_add_destructor (task->task_pool, (pool_destruct_func)free_byte_array_callback, text_part->content);
-					task->text_parts = g_list_prepend (task->text_parts, text_part);
-				}
-				else if (g_mime_content_type_is_type (type, "text", "plain")) {
-					msg_debug ("mime_foreach_callback: got urls from text/plain part");
-					url_parse_text (task, part_content);
-
-					text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
-					text_part->orig = part_content;
-					text_part->content = part_content;
-					text_part->is_html = FALSE;
-					text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
-					task->text_parts = g_list_prepend (task->text_parts, text_part);
-				}
+				process_text_part (task, part_content, type);
 			}
 		}
 		else {
diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c
index 573d370f9..a05e0e0e0 100644
--- a/src/plugins/regexp.c
+++ b/src/plugins/regexp.c
@@ -174,7 +174,7 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task)
 			return 0;
 		case REGEXP_HEADER:
 			if (re->header == NULL) {
-				msg_info ("process_regexp: header regexp without header name");
+				msg_info ("process_regexp: header regexp without header name: '%s'", re->regexp_text);
 				task_cache_add (task, re, 0);
 				return 0;
 			}