From 93e2e9d2faf9495283b2709f8749d81542d21ed1 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Wed, 10 Dec 2008 20:03:00 +0300 Subject: [PATCH] * Add logic for filtering messages with tokenizer/classifier pair --- src/cfg_file.h | 5 +++ src/cfg_file.l | 2 + src/cfg_file.y | 48 ++++++++++++++++++++- src/controller.c | 20 +-------- src/filter.c | 107 ++++++++++++++++++++++++++++++++++++++++++++++- src/util.h | 6 +++ 6 files changed, 168 insertions(+), 20 deletions(-) diff --git a/src/cfg_file.h b/src/cfg_file.h index f31efdc6c..4a2fb824d 100644 --- a/src/cfg_file.h +++ b/src/cfg_file.h @@ -51,6 +51,8 @@ fprintf (stderr, "\n") struct expression; +struct tokenizer; +struct classifier; enum { VAL_UNDEF=0, VAL_TRUE, VAL_FALSE }; @@ -98,7 +100,10 @@ struct statfile { char *alias; char *pattern; double weight; + char *metric; size_t size; + struct tokenizer *tokenizer; + struct classifier *classifier; }; struct config_file { diff --git a/src/cfg_file.l b/src/cfg_file.l index f9a90bb7d..7d940099f 100644 --- a/src/cfg_file.l +++ b/src/cfg_file.l @@ -55,6 +55,8 @@ alias return ALIAS; pattern return PATTERN; weight return WEIGHT; size return SIZE; +tokenizer return TOKENIZER; +classifier return CLASSIFIER; logging return LOGGING; diff --git a/src/cfg_file.y b/src/cfg_file.y index dc57df2a4..59d70b160 100644 --- a/src/cfg_file.y +++ b/src/cfg_file.y @@ -18,6 +18,8 @@ #include "cfg_file.h" #include "main.h" +#include "classifiers/classifiers.h" +#include "tokenizers/tokenizers.h" #define YYDEBUG 1 @@ -53,7 +55,7 @@ struct statfile *cur_statfile = NULL; %token REQUIRED_SCORE FUNCTION FRACT COMPOSITES CONTROL PASSWORD %token LOGGING LOG_TYPE LOG_TYPE_CONSOLE LOG_TYPE_SYSLOG LOG_TYPE_FILE %token LOG_LEVEL LOG_LEVEL_DEBUG LOG_LEVEL_INFO LOG_LEVEL_WARNING LOG_LEVEL_ERROR LOG_FACILITY LOG_FILENAME -%token STATFILE ALIAS PATTERN WEIGHT STATFILE_POOL_SIZE SIZE +%token STATFILE ALIAS PATTERN WEIGHT STATFILE_POOL_SIZE SIZE TOKENIZER CLASSIFIER %type STRING %type VARIABLE @@ -550,6 
+552,15 @@ statfile: yyerror ("yyparse: not enough arguments in statfile definition"); YYERROR; } + if (cur_statfile->metric == NULL) { + cur_statfile->metric = memory_pool_strdup (cfg->cfg_pool, "default"); + } + if (cur_statfile->classifier == NULL) { + cur_statfile->classifier = get_classifier ("winnow"); + } + if (cur_statfile->tokenizer == NULL) { + cur_statfile->tokenizer = get_tokenizer ("osb-text"); + } g_hash_table_insert (cfg->statfiles, cur_statfile->alias, cur_statfile); cur_statfile = NULL; } @@ -565,6 +576,9 @@ statfilecmd: | statfilepattern | statfileweight | statfilesize + | statfilemetric + | statfiletokenizer + | statfileclassifier ; statfilealias: @@ -615,6 +629,38 @@ statfilesize: } ; +statfilemetric: + METRIC EQSIGN QUOTEDSTRING { + if (cur_statfile == NULL) { + cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile)); + } + cur_statfile->metric = memory_pool_strdup (cfg->cfg_pool, $3); + } + ; + +statfiletokenizer: + TOKENIZER EQSIGN QUOTEDSTRING { + if (cur_statfile == NULL) { + cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile)); + } + if ((cur_statfile->tokenizer = get_tokenizer ($3)) == NULL) { + yyerror ("yyparse: unknown tokenizer %s", $3); + YYERROR; + } + } + ; + +statfileclassifier: + CLASSIFIER EQSIGN QUOTEDSTRING { + if (cur_statfile == NULL) { + cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile)); + } + if ((cur_statfile->classifier = get_classifier ($3)) == NULL) { + yyerror ("yyparse: unknown classifier %s", $3); + YYERROR; + } + } + ; statfile_pool_size: STATFILE_POOL_SIZE EQSIGN SIZELIMIT { diff --git a/src/controller.c b/src/controller.c index fa2fa268f..4369f7d7f 100644 --- a/src/controller.c +++ b/src/controller.c @@ -235,8 +235,8 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control session->learn_rcpt = NULL; session->learn_from = NULL; session->learn_filename = NULL; - session->learn_tokenizer = get_tokenizer ("osb-text"); - 
session->learn_classifier = get_classifier ("winnow"); + session->learn_tokenizer = statfile->tokenizer; + session->learn_classifier = statfile->classifier; /* By default learn positive */ session->in_class = 1; /* Get all arguments */ @@ -262,22 +262,6 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control } session->learn_from = memory_pool_strdup (session->session_pool, arg); break; - case 't': - arg = *(cmd_args + 1); - if (!arg || *arg == '\0' || (session->learn_tokenizer = get_tokenizer (arg)) == NULL) { - r = snprintf (out_buf, sizeof (out_buf), "tokenizer is not defined" CRLF, arg); - bufferevent_write (session->bev, out_buf, r); - return; - } - break; - case 'c': - arg = *(cmd_args + 1); - if (!arg || *arg == '\0' || (session->learn_classifier = get_classifier (arg)) == NULL) { - r = snprintf (out_buf, sizeof (out_buf), "classifier is not defined" CRLF, arg); - bufferevent_write (session->bev, out_buf, r); - return; - } - break; case 'n': session->in_class = 0; break; diff --git a/src/filter.c b/src/filter.c index ae5852efb..5bf0db27a 100644 --- a/src/filter.c +++ b/src/filter.c @@ -8,6 +8,9 @@ #include "main.h" #include "cfg_file.h" #include "perl.h" +#include "util.h" +#include "classifiers/classifiers.h" +#include "tokenizers/tokenizers.h" void insert_result (struct worker_task *task, const char *metric_name, const char *symbol, u_char flag) @@ -330,11 +333,113 @@ composites_metric_callback (gpointer key, gpointer value, void *data) g_hash_table_foreach (task->cfg->composite_symbols, composites_foreach_callback, cd); } -void make_composites (struct worker_task *task) +void +make_composites (struct worker_task *task) { g_hash_table_foreach (task->results, composites_metric_callback, task); } +struct statfile_callback_data { + GHashTable *metrics; + GHashTable *tokens; + struct worker_task *task; +}; + +static void +statfiles_callback (gpointer key, gpointer value, void *arg) +{ + struct statfile_callback_data *data= (struct 
statfile_callback_data *)arg; + struct worker_task *task = data->task; + struct statfile *st = (struct statfile *)value; + GTree *tokens; + char *filename; + double weight, *w; + + if (g_list_length (task->rcpt) == 1) { + filename = resolve_stat_filename (task->task_pool, st->pattern, task->from, (char *)task->rcpt->data); + } + else { + /* XXX: handle multiple recipients correctly */ + filename = resolve_stat_filename (task->task_pool, st->pattern, task->from, ""); + } + + if (statfile_pool_open (task->worker->srv->statfile_pool, filename) == -1) { + return; + } + + if ((tokens = g_hash_table_lookup (data->tokens, st->tokenizer)) == NULL) { + /* Tree will be freed when the task pool is freed */ + tokens = st->tokenizer->tokenize_func (st->tokenizer, task->task_pool, task->msg->buf); + if (tokens == NULL) { + msg_info ("statfiles_callback: cannot tokenize input"); + return; + } + g_hash_table_insert (data->tokens, st->tokenizer, tokens); + } + + weight = st->classifier->classify_func (task->worker->srv->statfile_pool, filename, tokens); + + if (weight > 0.000001) { + if ((w = g_hash_table_lookup (data->metrics, st->metric)) == NULL) { + w = memory_pool_alloc (task->task_pool, sizeof (double)); + *w = weight * st->weight; + g_hash_table_insert (data->metrics, st->metric, w); + } + else { + *w += weight * st->weight; + } + } + +} + +static void +statfiles_results_callback (gpointer key, gpointer value, void *arg) +{ + struct worker_task *task = (struct worker_task *)arg; + struct metric_result *metric_res; + struct metric *metric; + double w; + + metric_res = g_hash_table_lookup (task->results, (char *)key); + w = *(double *)value; + + metric = g_hash_table_lookup (task->worker->srv->cfg->metrics, (char *)key); + if (metric == NULL) { + return; + } + + if (metric_res == NULL) { + /* Create new metric chain */ + metric_res = memory_pool_alloc (task->task_pool, sizeof (struct metric_result)); + metric_res->symbols = g_hash_table_new (g_str_hash, g_str_equal); + 
memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_hash_table_destroy, metric_res->symbols); + metric_res->metric = metric; + metric_res->score = w; + g_hash_table_insert (task->results, key, metric_res); + } + else { + metric_res->score += w; + } + g_hash_table_insert (metric_res->symbols, key, GSIZE_TO_POINTER (1)); + +} + + +void +process_statfiles (struct worker_task *task) +{ + struct statfile_callback_data cd; + + cd.task = task; + cd.tokens = g_hash_table_new (g_direct_hash, g_direct_equal); + cd.metrics = g_hash_table_new (g_str_hash, g_str_equal); + + g_hash_table_foreach (task->cfg->statfiles, statfiles_callback, &cd); + g_hash_table_foreach (cd.metrics, statfiles_results_callback, task); + + g_hash_table_destroy (cd.tokens); + g_hash_table_destroy (cd.metrics); +} /* * vi:ts=4 diff --git a/src/util.h b/src/util.h index 9c05a2a27..5d4590c4c 100644 --- a/src/util.h +++ b/src/util.h @@ -42,7 +42,13 @@ int setproctitle(const char *fmt, ...); #ifndef HAVE_PIDFILE struct pidfh { int pf_fd; +#ifdef HAVE_PATH_MAX + char pf_path[PATH_MAX + 1]; +#elif defined(HAVE_MAXPATHLEN) char pf_path[MAXPATHLEN + 1]; +#else + char pf_path[1024 + 1]; +#endif __dev_t pf_dev; ino_t pf_ino; }; -- 2.39.5