* Add autolearn config options

* Fix parsing of invalid urls in html parser
* Add ability to specify symbols in view parameter as comma-separated list
author: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-07-03 18:59:32 +0400
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-07-03 18:59:32 +0400
commit: 27360c622541db1cf27dc5bef39524ca912b0e3d (patch)
tree: 7dd4d737853d4fe393f8dfaa094a24da204997a2
parent: ad56efc14e371b6a452c1ccc46aa68d800125468 (diff)
download: rspamd-27360c622541db1cf27dc5bef39524ca912b0e3d.tar.gz
rspamd-27360c622541db1cf27dc5bef39524ca912b0e3d.zip
7 files changed, 131 insertions, 3 deletions
diff --git a/src/cfg_file.h b/src/cfg_file.h
index de3003306..850c34ece 100644
--- a/src/cfg_file.h
+++ b/src/cfg_file.h
@@ -117,6 +117,16 @@ struct statfile_section {
 };
 
 /**
+ * Statfile autolearn parameters
+ */
+struct statfile_autolearn_params {
+	const char *metric;								/**< metric name for autolearn triggering 				*/
+	double threshold_min;							/**< threshold mark										*/
+	double threshold_max;							/**< threshold mark										*/
+	GList *symbols;									/**< list of symbols									*/
+};
+
+/**
  * Statfile config definition
  */
 struct statfile {
@@ -127,6 +137,7 @@ struct statfile {
 	size_t size;									/**< size of statfile									*/
 	struct tokenizer *tokenizer;					/**< tokenizer used for statfile						*/
 	GList *sections;								/**< list of sections in statfile						*/
+	struct statfile_autolearn_params *autolearn;	/**< autolearn params									*/
 };
 
 /**
@@ -304,6 +315,9 @@ void post_load_config (struct config_file *cfg);
  */
 void unescape_quotes (char *line);
 
+GList* parse_comma_list (memory_pool_t *pool, char *line);
+
+
 int yylex (void);
 int yyparse (void);
 void yyrestart (FILE *);
diff --git a/src/cfg_file.l b/src/cfg_file.l
index a2589441e..0cf635f7e 100644
--- a/src/cfg_file.l
+++ b/src/cfg_file.l
@@ -77,6 +77,9 @@ size							return SIZE;
 tokenizer						return TOKENIZER;
 classifier						return CLASSIFIER;
 section							return SECTION;
+autolearn						return AUTOLEARN;
+min_mark						return MIN_MARK;
+max_mark						return MAX_MARK;
 
 logging							return LOGGING;
 
diff --git a/src/cfg_file.y b/src/cfg_file.y
index 17854f22b..1fdc7275f 100644
--- a/src/cfg_file.y
+++ b/src/cfg_file.y
@@ -24,6 +24,7 @@ GList *cur_module_opt = NULL;
 struct metric *cur_metric = NULL;
 struct statfile *cur_statfile = NULL;
 struct statfile_section *cur_section = NULL;
+struct statfile_autolearn_params *cur_autolearn = NULL;
 struct worker_conf *cur_worker = NULL;
 
 struct rspamd_view *cur_view = NULL;
@@ -55,6 +56,7 @@ struct rspamd_view *cur_view = NULL;
 %token  STATFILE ALIAS PATTERN WEIGHT STATFILE_POOL_SIZE SIZE TOKENIZER CLASSIFIER
 %token	DELIVERY LMTP ENABLED AGENT SECTION LUACODE RAW_MODE PROFILE_FILE COUNT
 %token  VIEW IP FROM SYMBOLS
+%token  AUTOLEARN MIN_MARK MAX_MARK
 
 %type	<string>	STRING
 %type	<string>	VARIABLE
@@ -672,6 +674,7 @@ statfilecmd:
 	| statfilemetric
 	| statfiletokenizer
 	| statfilesection
+	| statfileautolearn
 	;
 	
 statfilealias:
@@ -807,6 +810,80 @@ sectionweight:
 	}
 	;
 
+statfileautolearn:
+	AUTOLEARN OBRACE autolearnbody EBRACE {
+		if (cur_statfile == NULL) {
+			cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
+		}
+		if (cur_autolearn == NULL) {
+			yyerror ("yyparse: error in autolearn definition");
+			YYERROR;
+		}
+		cur_statfile->autolearn = cur_autolearn;
+		cur_autolearn = NULL;
+	}
+	;
+
+autolearnbody:
+	autolearncmd SEMICOLON
+	| autolearnbody autolearncmd SEMICOLON
+	;
+
+autolearncmd:
+	autolearnmetric
+	| autolearnmin
+	| autolearnmax
+	| autolearnsymbols
+	;
+
+autolearnmetric:
+	METRIC EQSIGN QUOTEDSTRING {
+		if (cur_autolearn == NULL) {
+			cur_autolearn = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile_autolearn_params));
+		}
+		cur_autolearn->metric = memory_pool_strdup (cfg->cfg_pool, $3);
+	}
+	;
+
+autolearnmin:
+	MIN_MARK EQSIGN NUMBER {
+		if (cur_autolearn == NULL) {
+			cur_autolearn = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile_autolearn_params));
+		}
+		cur_autolearn->threshold_min = $3;
+	}
+	| MIN_MARK EQSIGN FRACT {
+		if (cur_autolearn == NULL) {
+			cur_autolearn = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile_autolearn_params));
+		}
+		cur_autolearn->threshold_min = $3;
+	}
+	;
+
+autolearnmax:
+	MAX_MARK EQSIGN NUMBER {
+		if (cur_autolearn == NULL) {
+			cur_autolearn = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile_autolearn_params));
+		}
+		cur_autolearn->threshold_max = $3;
+	}
+	| MAX_MARK EQSIGN FRACT {
+		if (cur_autolearn == NULL) {
+			cur_autolearn = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile_autolearn_params));
+		}
+		cur_autolearn->threshold_max = $3;
+	}
+	;
+
+autolearnsymbols:
+	SYMBOLS EQSIGN QUOTEDSTRING {
+		if (cur_autolearn == NULL) {
+			cur_autolearn = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile_autolearn_params));
+		}
+		cur_autolearn->symbols = parse_comma_list (cfg->cfg_pool, $3);
+	}
+	;
+
 statfile_pool_size:
 	STATFILE_POOL_SIZE EQSIGN SIZELIMIT {
 		cfg->max_statfile_size = $3;
diff --git a/src/cfg_utils.c b/src/cfg_utils.c
index afb15a652..900e55f1b 100644
--- a/src/cfg_utils.c
+++ b/src/cfg_utils.c
@@ -586,6 +586,34 @@ unescape_quotes (char *line)
 	}
 }
 
+GList *
+parse_comma_list (memory_pool_t *pool, char *line)
+{
+	GList *res = NULL;
+	char *c, *p, *str;
+	
+	c = line;
+	p = c;
+
+	while (*p) {
+		if (*p == ',' && *c != *p) {
+			str = memory_pool_alloc (pool, p - c + 1);
+			g_strlcpy (str, c, p - c + 1);
+			res = g_list_prepend (res, str);
+			/* Skip spaces */
+			while (g_ascii_isspace (*(++p)));
+			c = p;
+			continue;
+		}
+		p ++;
+	}
+	if (res != NULL) {
+		memory_pool_add_destructor (pool, (pool_destruct_func)g_list_free, res);
+	}
+
+	return res;
+}
+
 /*
  * vi:ts=4
  */
diff --git a/src/filter.c b/src/filter.c
index b34a88003..daa9b0e29 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -498,7 +498,7 @@ statfiles_callback (gpointer key, gpointer value, void *arg)
 		filename = resolve_stat_filename (task->task_pool, st->pattern, task->from, "");
 	}
 	
-	if (statfile_pool_open (task->worker->srv->statfile_pool, filename) == -1) {
+	if (statfile_pool_open (task->worker->srv->statfile_pool, filename) == NULL) {
 		return;
 	}
 	
diff --git a/src/html.c b/src/html.c
index 5b3552c7f..bdd82681f 100644
--- a/src/html.c
+++ b/src/html.c
@@ -303,7 +303,7 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i
 		url = memory_pool_alloc (task->task_pool, sizeof (struct uri));
 		rc = parse_uri (url, url_text, task->task_pool);
 
-		if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
+		if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST && url->hostlen != 0) {
 			if (part->html_urls && g_tree_lookup (part->html_urls, url_text) == NULL) {
 				g_tree_insert (part->html_urls, url_text, url);
 				task->urls = g_list_prepend (task->urls, url);
diff --git a/src/view.c b/src/view.c
index aeec8cd51..0bd534b32 100644
--- a/src/view.c
+++ b/src/view.c
@@ -68,6 +68,7 @@ gboolean
 add_view_symbols (struct rspamd_view *view, char *line)
 {
 	struct rspamd_regexp *re = NULL;
+	GList *symbols;
 
 	if (g_ascii_strncasecmp (line, "file://", sizeof ("file://") - 1) == 0) {
 		if (parse_host_list (view->pool, view->symbols_hash, line + sizeof ("file://") - 1)) {
@@ -80,7 +81,12 @@ add_view_symbols (struct rspamd_view *view, char *line)
 	}
 	else {
 		/* Try to parse symbols line as comma separated list */
-		
+		symbols = parse_comma_list (view->pool, line);
+		while (symbols) {
+			g_hash_table_insert (view->symbols_hash, (char *)symbols->data, symbols->data);
+			/* Symbols list would be free at pool destruction */
+			symbols = g_list_next (symbols);
+		}
 	}
 
 	return FALSE;
author	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2009-07-03 18:59:32 +0400
committer	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2009-07-03 18:59:32 +0400
commit	27360c622541db1cf27dc5bef39524ca912b0e3d (patch)
tree	7dd4d737853d4fe393f8dfaa094a24da204997a2
parent	ad56efc14e371b6a452c1ccc46aa68d800125468 (diff)
download	rspamd-27360c622541db1cf27dc5bef39524ca912b0e3d.tar.gz rspamd-27360c622541db1cf27dc5bef39524ca912b0e3d.zip