* Validate utf8 chars to avoid crashes

author: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-04-24 15:23:41 +0400
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-04-24 15:23:41 +0400
commit: 521c2b24b92d2085629d0e34d18110b3a643a77a (patch)
tree: 44a264c37e7b0fd6e021cb4a99189f95fd274dd8
parent: 0cc688fe0be5662e761639d853745153a13522f2 (diff)
download: rspamd-521c2b24b92d2085629d0e34d18110b3a643a77a.tar.gz
rspamd-521c2b24b92d2085629d0e34d18110b3a643a77a.zip
4 files changed, 69 insertions, 3 deletions
diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c
index fe2ed858e..d05912f57 100644
--- a/src/plugins/chartable.c
+++ b/src/plugins/chartable.c
@@ -142,14 +142,23 @@ check_part (struct mime_text_part *part, gboolean raw_mode)
 	}
 	else {
 		while (remain > 0) {
-			c = g_utf8_get_char (p);
+			c = g_utf8_get_char_validated (p, remain);
+			if (c == (gunichar)-2 || c == (gunichar)-1) {
+				/* Invalid characters detected, stop processing*/
+				return FALSE;
+			}
+
 			scc = g_unichar_get_script (c);
 			p1 = g_utf8_next_char (p);
 			remain -= p1 - p;
 			p = p1;
 			
 			if (remain > 0) {
-				t = g_utf8_get_char (p);
+				t = g_utf8_get_char_validated (p, remain);
+				if (t == (gunichar)-2 || t == (gunichar)-1) {
+					/* The following character t is not valid UTF-8, stop processing */
+					return FALSE;
+				}
 				sct = g_unichar_get_script (t);
 				if (g_unichar_isalnum (c) && g_unichar_isalnum (t)) {
 					/* We have two unicode alphanumeric characters, so we can check its script */
diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c
index a0d7e1f98..e3e3853da 100644
--- a/src/plugins/regexp.c
+++ b/src/plugins/regexp.c
@@ -36,18 +36,28 @@
 #include "../cfg_file.h"
 #include "../expressions.h"
 
+#define DEFAULT_STATFILE_PREFIX "./"
+
 struct regexp_module_item {
 	struct expression *expr;
 	char *symbol;
 };
 
+struct autolearn_data {	/* one parsed SYMBOL:statfile:weight entry */
+	char *statfile_name;	/* second field of the option value */
+	char *symbol;	/* first field; also used as the key in autolearn_symbols */
+	float weight;		/* third field, converted with strtod */
+};
+
 struct regexp_ctx {
 	int (*header_filter)(struct worker_task *task);
 	int (*mime_filter)(struct worker_task *task);
 	int (*message_filter)(struct worker_task *task);
 	int (*url_filter)(struct worker_task *task);
 	GList *items;
+	GHashTable *autolearn_symbols;	/* string-keyed: symbol -> struct autolearn_data */
 	char *metric;
+	char *statfile_prefix;	/* pool copy of the "statfile_prefix" module option */
 
 	memory_pool_t *regexp_pool;
 };
@@ -68,6 +78,7 @@ regexp_module_init (struct config_file *cfg, struct module_ctx **ctx)
 	regexp_module_ctx->url_filter = NULL;
 	regexp_module_ctx->regexp_pool = memory_pool_new (1024);
 	regexp_module_ctx->items = NULL;
+	regexp_module_ctx->autolearn_symbols = g_hash_table_new (g_str_hash, g_str_equal);
 
 	*ctx = (struct module_ctx *)regexp_module_ctx;
 	register_expression_function ("regexp_match_number", rspamd_regexp_match_number);
@@ -102,6 +113,37 @@ read_regexp_expression (memory_pool_t *pool, struct regexp_module_item *chain, c
 	return TRUE;
 }
 
+/*
+ * Parse an autolearn option value in the format SYMBOL:statfile:weight and
+ * store it in regexp_module_ctx->autolearn_symbols keyed by SYMBOL. strsep
+ * leaves p NULL once the delimiters run out, so warnings print value, not p.
+ */
+void
+parse_autolearn_param (const char *param, const char *value, struct config_file *cfg)
+{
+	struct autolearn_data *d;
+	char *p;
+
+	p = memory_pool_strdup (regexp_module_ctx->regexp_pool, value);
+	d = memory_pool_alloc (regexp_module_ctx->regexp_pool, sizeof (struct autolearn_data));
+
+	d->symbol = strsep (&p, ":");
+	d->statfile_name = strsep (&p, ":");
+	if (d->symbol == NULL || *d->symbol == '\0') {
+		msg_warn ("parse_autolearn_param: cannot extract symbol name from %s", value);
+	}
+	else if (d->statfile_name == NULL || *d->statfile_name == '\0') {
+		msg_warn ("parse_autolearn_param: cannot extract statfile name from %s", value);
+	}
+	else if (p == NULL || *p == '\0') {
+		msg_warn ("parse_autolearn_param: cannot extract weight from %s", value);
+	}
+	else {
+		d->weight = strtod (p, NULL);
+		g_hash_table_insert (regexp_module_ctx->autolearn_symbols, d->symbol, d);
+	}
+}
+
 int
 regexp_module_config (struct config_file *cfg)
 {
@@ -118,11 +160,22 @@ regexp_module_config (struct config_file *cfg)
 	else {
 		regexp_module_ctx->metric = DEFAULT_METRIC;
 	}
+	if ((value = get_module_opt (cfg, "regexp", "statfile_prefix")) != NULL) {
+		regexp_module_ctx->statfile_prefix = memory_pool_strdup (regexp_module_ctx->regexp_pool, value);
+		g_free (value);
+	}
+	else {	/* option absent: fall back to the default prefix, metric stays as set */
+		regexp_module_ctx->statfile_prefix = DEFAULT_STATFILE_PREFIX;
+	}
 
 	cur_module_opt = g_hash_table_lookup (cfg->modules_opts, "regexp");
 	if (cur_module_opt != NULL) {
 		LIST_FOREACH (cur, cur_module_opt, next) {
-			if (strcmp (cur->param, "metric") == 0) {
+			if (strcmp (cur->param, "metric") == 0 || strcmp (cur->param, "statfile_prefix") == 0) {
+				continue;	/* not a regexp rule: read separately via get_module_opt */
+			}
+			else if (g_ascii_strncasecmp (cur->param, "autolearn", sizeof ("autolearn") - 1) == 0) {
+				parse_autolearn_param (cur->param, cur->value, cfg);	/* name starts with "autolearn" */
 				continue;
 			}
 			cur_item = memory_pool_alloc0 (regexp_module_ctx->regexp_pool, sizeof (struct regexp_module_item));
diff --git a/src/statfile.c b/src/statfile.c
index 8537a054e..e41c1af15 100644
--- a/src/statfile.c
+++ b/src/statfile.c
@@ -533,6 +533,9 @@ statfile_get_section_by_name (const char *name)
 	else if (g_ascii_strcasecmp (name, "url") == 0) {
 		return STATFILE_SECTION_URLS;
 	}
+	else if (g_ascii_strcasecmp (name, "regexp") == 0) {
+		return STATFILE_SECTION_REGEXP;
+	}
 
 	return 0;
 }
diff --git a/src/statfile.h b/src/statfile.h
index ee89acda9..39537944a 100644
--- a/src/statfile.h
+++ b/src/statfile.h
@@ -16,6 +16,7 @@
 #define STATFILE_SECTION_COMMON 1
 #define STATFILE_SECTION_HEADERS 2
 #define STATFILE_SECTION_URLS 3
+#define STATFILE_SECTION_REGEXP 4
 
 /**
  * Common statfile header
author	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2009-04-24 15:23:41 +0400
committer	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2009-04-24 15:23:41 +0400
commit	521c2b24b92d2085629d0e34d18110b3a643a77a (patch)
tree	44a264c37e7b0fd6e021cb4a99189f95fd274dd8
parent	0cc688fe0be5662e761639d853745153a13522f2 (diff)
download	rspamd-521c2b24b92d2085629d0e34d18110b3a643a77a.tar.gz rspamd-521c2b24b92d2085629d0e34d18110b3a643a77a.zip