/**
 * Statfile autolearn parameters.
 *
 * One instance is filled in by the config parser for an `autolearn { ... }`
 * block of a statfile definition and stored in `struct statfile::autolearn`.
 */
struct statfile_autolearn_params {
	const char *metric;						/**< metric name for autolearn triggering (`metric` option) */
	double threshold_min;					/**< minimum threshold mark (`min_mark` option) */
	double threshold_max;					/**< maximum threshold mark (`max_mark` option) */
	GList *symbols;							/**< list of symbols: strings split from the comma-separated `symbols` option */
};
/*
 * autolearn { ... } block inside a statfile definition.
 * The body rules collect their values in cur_autolearn; this rule attaches
 * the collected parameters to the statfile that is currently being parsed.
 */
statfileautolearn:
	AUTOLEARN OBRACE autolearnbody EBRACE {
		/* Make sure there is a statfile to attach the parameters to */
		if (!cur_statfile) {
			cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (*cur_statfile));
		}
		/* Nothing was collected by the body rules: report a parse error */
		if (!cur_autolearn) {
			yyerror ("yyparse: error in autolearn definition");
			YYERROR;
		}
		/* Hand the parameters over and reset the accumulator for the next block */
		cur_statfile->autolearn = cur_autolearn;
		cur_autolearn = NULL;
	}
	;

/* One or more semicolon-terminated autolearn commands */
autolearnbody:
	autolearncmd SEMICOLON
	| autolearnbody autolearncmd SEMICOLON
	;

/* A single option inside the autolearn block */
autolearncmd:
	autolearnmetric
	| autolearnmin
	| autolearnmax
	| autolearnsymbols
	;

/* metric = "name": remember a pool-owned copy of the metric name */
autolearnmetric:
	METRIC EQSIGN QUOTEDSTRING {
		/* Allocate the parameters lazily, on the first option seen */
		if (!cur_autolearn) {
			cur_autolearn = memory_pool_alloc0 (cfg->cfg_pool, sizeof (*cur_autolearn));
		}
		cur_autolearn->metric = memory_pool_strdup (cfg->cfg_pool, $3);
	}
	;
(cfg->cfg_pool, sizeof (struct statfile_autolearn_params)); + } + cur_autolearn->threshold_min = $3; + } + ; + +autolearnmax: + MAX_MARK EQSIGN NUMBER { + if (cur_autolearn == NULL) { + cur_autolearn = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile_autolearn_params)); + } + cur_autolearn->threshold_max = $3; + } + | MAX_MARK EQSIGN FRACT { + if (cur_autolearn == NULL) { + cur_autolearn = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile_autolearn_params)); + } + cur_autolearn->threshold_max = $3; + } + ; + +autolearnsymbols: + SYMBOLS EQSIGN QUOTEDSTRING { + if (cur_autolearn == NULL) { + cur_autolearn = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile_autolearn_params)); + } + cur_autolearn->symbols = parse_comma_list (cfg->cfg_pool, $3); + } + ; + statfile_pool_size: STATFILE_POOL_SIZE EQSIGN SIZELIMIT { cfg->max_statfile_size = $3; diff --git a/src/cfg_utils.c b/src/cfg_utils.c index afb15a652..900e55f1b 100644 --- a/src/cfg_utils.c +++ b/src/cfg_utils.c @@ -586,6 +586,34 @@ unescape_quotes (char *line) } } +GList * +parse_comma_list (memory_pool_t *pool, char *line) +{ + GList *res = NULL; + char *c, *p, *str; + + c = line; + p = c; + + while (*p) { + if (*p == ',' && *c != *p) { + str = memory_pool_alloc (pool, p - c + 1); + g_strlcpy (str, c, p - c + 1); + res = g_list_prepend (res, str); + /* Skip spaces */ + while (g_ascii_isspace (*(++p))); + c = p; + continue; + } + p ++; + } + if (res != NULL) { + memory_pool_add_destructor (pool, (pool_destruct_func)g_list_free, res); + } + + return res; +} + /* * vi:ts=4 */ diff --git a/src/filter.c b/src/filter.c index b34a88003..daa9b0e29 100644 --- a/src/filter.c +++ b/src/filter.c @@ -498,7 +498,7 @@ statfiles_callback (gpointer key, gpointer value, void *arg) filename = resolve_stat_filename (task->task_pool, st->pattern, task->from, ""); } - if (statfile_pool_open (task->worker->srv->statfile_pool, filename) == -1) { + if (statfile_pool_open (task->worker->srv->statfile_pool, 
filename) == NULL) { return; } diff --git a/src/html.c b/src/html.c index 5b3552c7f..bdd82681f 100644 --- a/src/html.c +++ b/src/html.c @@ -303,7 +303,7 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i url = memory_pool_alloc (task->task_pool, sizeof (struct uri)); rc = parse_uri (url, url_text, task->task_pool); - if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) { + if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST && url->hostlen != 0) { if (part->html_urls && g_tree_lookup (part->html_urls, url_text) == NULL) { g_tree_insert (part->html_urls, url_text, url); task->urls = g_list_prepend (task->urls, url); diff --git a/src/view.c b/src/view.c index aeec8cd51..0bd534b32 100644 --- a/src/view.c +++ b/src/view.c @@ -68,6 +68,7 @@ gboolean add_view_symbols (struct rspamd_view *view, char *line) { struct rspamd_regexp *re = NULL; + GList *symbols; if (g_ascii_strncasecmp (line, "file://", sizeof ("file://") - 1) == 0) { if (parse_host_list (view->pool, view->symbols_hash, line + sizeof ("file://") - 1)) { @@ -80,7 +81,12 @@ add_view_symbols (struct rspamd_view *view, char *line) } else { /* Try to parse symbols line as comma separated list */ - + symbols = parse_comma_list (view->pool, line); + while (symbols) { + g_hash_table_insert (view->symbols_hash, (char *)symbols->data, symbols->data); + /* Symbols list would be free at pool destruction */ + symbols = g_list_next (symbols); + } } return FALSE; -- 2.39.5