* Try to convert each text part of messages to utf8 to avoid problems with regexps

author Vsevolod Stakhov <vsevolod@rambler-co.ru>

Wed, 15 Apr 2009 12:24:55 +0000 (16:24 +0400)

committer Vsevolod Stakhov <vsevolod@rambler-co.ru>

Wed, 15 Apr 2009 12:24:55 +0000 (16:24 +0400)
author Vsevolod Stakhov <vsevolod@rambler-co.ru>
Wed, 15 Apr 2009 12:24:55 +0000 (16:24 +0400)
committer Vsevolod Stakhov <vsevolod@rambler-co.ru>
Wed, 15 Apr 2009 12:24:55 +0000 (16:24 +0400)
diff --git a/src/main.c b/src/main.c

index 348e7a1ea5fb2ec4a482be357bcd9d6d3984faf5..fcf3f36673c16e633ca2593321186c432d18b44e 100644 (file)
--- a/src/main.c
+++ b/src/main.c
@@ -392,6 +392,7 @@ main (int argc, char **argv, char **env)
         int res = 0, i, listen_sock, lmtp_listen_sock;
         struct sigaction signals;
         struct rspamd_worker *cur, *cur_tmp, *active_worker;
+       struct rlimit rlim;
         FILE *f;
         pid_t wrk;
  #ifndef WITHOUT_PERL
@@ -506,6 +507,11 @@ main (int argc, char **argv, char **env)
  
         /* Drop privilleges */
         drop_priv (cfg);
+
+       /* Set stack size for pcre */
+       getrlimit(RLIMIT_STACK, &rlim);
+       rlim.rlim_cur = 100 * 1024 * 1024;
+       setrlimit(RLIMIT_STACK, &rlim);
         
         config_logger (rspamd, TRUE);
  
diff --git a/src/message.c b/src/message.c

index 4d127d39d95e7c78e333cc5df31be42530675126..0d48441414d10d1c6fb305a937e63e82eeeca54e 100644 (file)
--- a/src/message.c
+++ b/src/message.c
@@ -234,6 +234,69 @@ free_byte_array_callback (void *pointer)
         g_byte_array_free (arr, TRUE);
  }
  
+/**
+ * Convert the content of a text part to UTF-8 according to its declared charset.
+ *
+ * @param task          task whose memory pool becomes responsible for freeing a newly created array
+ * @param part_content  raw bytes of the part
+ * @param type          content type of the part; its "charset" parameter names the source encoding
+ * @return part_content itself when the charset is already utf-8/utf8 or when the conversion
+ *         fails; otherwise a new byte array that is freed by a destructor registered on
+ *         task->task_pool
+ */
+static GByteArray *
+convert_text_to_utf (struct worker_task *task, GByteArray *part_content, GMimeContentType *type)
+{
+       GError *err = NULL;
+       gsize read_bytes, write_bytes;
+       const char *charset;
+       gchar *res_str;
+       GByteArray *result_array;
+
+       /* A part without an explicit charset parameter is treated as ASCII */
+       if ((charset = g_mime_content_type_get_parameter (type, "charset")) == NULL) {
+               charset = "ASCII";
+       }
+
+       /* Nothing to do if the part already claims to be UTF-8 */
+       if (g_ascii_strcasecmp (charset, "utf-8") == 0 || g_ascii_strcasecmp (charset, "utf8") == 0) {
+               return part_content;
+       }
+
+       res_str = g_convert_with_fallback ((const gchar *)part_content->data, part_content->len,
+                                          "UTF-8", charset, NULL,
+                                          &read_bytes, &write_bytes, &err);
+       if (res_str == NULL) {
+               msg_warn ("convert_text_to_utf: cannot convert from %s to utf8: %s", charset, err ? err->message : "unknown problem");
+               /* The error object is owned by us and must be released to avoid a leak */
+               if (err != NULL) {
+                       g_error_free (err);
+               }
+               return part_content;
+       }
+
+       /*
+        * Build a real GByteArray: the pool destructor releases it with g_byte_array_free,
+        * which is only valid for arrays created by GLib itself, not for a hand-made struct.
+        * NOTE(review): the length keeps counting the terminating NUL (write_bytes + 1) as
+        * before - confirm that consumers of the converted text expect the extra byte.
+        */
+       result_array = g_byte_array_new ();
+       g_byte_array_append (result_array, (const guint8 *)res_str, write_bytes + 1);
+       g_free (res_str);
+       memory_pool_add_destructor (task->task_pool, (pool_destruct_func)free_byte_array_callback, result_array);
+
+       return result_array;
+}
+
+/**
+ * Register a text/html, text/xhtml or text/plain part in task->text_parts.
+ *
+ * URLs are extracted from the raw content first, then a mime_text_part is allocated from
+ * the task pool with its UTF-8 converted original, its content and a fuzzy hash of that
+ * content. Parts of any other content type are ignored.
+ *
+ * @param task          task that receives the new text part
+ * @param part_content  raw bytes of the part
+ * @param type          content type of the part
+ */
+static void
+process_text_part (struct worker_task *task, GByteArray *part_content, GMimeContentType *type)
+{
+       struct mime_text_part *new_part;
+       gboolean html_part;
+
+       html_part = g_mime_content_type_is_type (type, "text", "html") || g_mime_content_type_is_type (type, "text", "xhtml");
+       if (!html_part && !g_mime_content_type_is_type (type, "text", "plain")) {
+               /* Not a text type we know how to process */
+               return;
+       }
+
+       if (html_part) {
+               msg_debug ("mime_foreach_callback: got urls from text/html part");
+               url_parse_html (task, part_content);
+       }
+       else {
+               msg_debug ("mime_foreach_callback: got urls from text/plain part");
+               url_parse_text (task, part_content);
+       }
+
+       new_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
+       new_part->orig = convert_text_to_utf (task, part_content, type);
+       if (html_part) {
+               /* HTML content is the raw part with tags stripped */
+               new_part->content = strip_html_tags (part_content, NULL);
+       }
+       else {
+               new_part->content = part_content;
+       }
+       new_part->is_html = html_part ? TRUE : FALSE;
+       new_part->fuzzy = fuzzy_init_byte_array (new_part->content, task->task_pool);
+       if (html_part) {
+               /* Only the stripped copy gets a destructor registered here */
+               memory_pool_add_destructor (task->task_pool, (pool_destruct_func)free_byte_array_callback, new_part->content);
+       }
+       task->text_parts = g_list_prepend (task->text_parts, new_part);
+}
+
  #ifdef GMIME24
  static void
  mime_foreach_callback (GMimeObject *parent, GMimeObject *part, gpointer user_data)
@@ -244,7 +307,6 @@ mime_foreach_callback (GMimeObject *part, gpointer user_data)
  {
         struct worker_task *task = (struct worker_task *)user_data;
         struct mime_part *mime_part;
-       struct mime_text_part *text_part;
         GMimeContentType *type;
         GMimeDataWrapper *wrapper;
         GMimeStream *part_stream;
@@ -310,30 +372,7 @@ mime_foreach_callback (GMimeObject *part, gpointer user_data)
                                 task->parts = g_list_prepend (task->parts, mime_part);
                                 /* Skip empty parts */
                                 if (part_content->len > 0) {
-                                       /* Now do special processing for text parts of message */
-                                       if (g_mime_content_type_is_type (type, "text", "html") || g_mime_content_type_is_type (type, "text", "xhtml")) {
-                                               msg_debug ("mime_foreach_callback: got urls from text/html part");
-                                               url_parse_html (task, part_content);
-
-                                               text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
-                                               text_part->orig = part_content;
-                                               text_part->content = strip_html_tags (part_content, NULL);
-                                               text_part->is_html = TRUE;
-                                               text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
-                                               memory_pool_add_destructor (task->task_pool, (pool_destruct_func)free_byte_array_callback, text_part->content);
-                                               task->text_parts = g_list_prepend (task->text_parts, text_part);
-                                       } 
-                                       else if (g_mime_content_type_is_type (type, "text", "plain")) {
-                                               msg_debug ("mime_foreach_callback: got urls from text/plain part");
-                                               url_parse_text (task, part_content);
-
-                                               text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
-                                               text_part->orig = part_content;
-                                               text_part->content = part_content;
-                                               text_part->is_html = FALSE;
-                                               text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
-                                               task->text_parts = g_list_prepend (task->text_parts, text_part);
-                                       }
+                                       process_text_part (task, part_content, type);
                                 }
                         }
                         else {
diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c

index 573d370f9b2ae4e1a943a0d7efe0e62c49db5558..a05e0e0e066e34c3280237fbd132e65d35b15e88 100644 (file)
--- a/src/plugins/regexp.c
+++ b/src/plugins/regexp.c
@@ -174,7 +174,7 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task)
                         return 0;
                 case REGEXP_HEADER:
                         if (re->header == NULL) {
-                               msg_info ("process_regexp: header regexp without header name");
+                               msg_info ("process_regexp: header regexp without header name: '%s'", re->regexp_text);
                                 task_cache_add (task, re, 0);
                                 return 0;
                         }
author	Vsevolod Stakhov <vsevolod@rambler-co.ru>
	Wed, 15 Apr 2009 12:24:55 +0000 (16:24 +0400)
committer	Vsevolod Stakhov <vsevolod@rambler-co.ru>
	Wed, 15 Apr 2009 12:24:55 +0000 (16:24 +0400)
src/main.c		patch \| blob \| history
src/message.c		patch \| blob \| history
src/plugins/regexp.c		patch \| blob \| history