]> source.dussan.org Git - rspamd.git/commitdiff
* Add logic for filtering messages with tokenizer/classifier pair
author Vsevolod Stakhov <vsevolod@rambler-co.ru>
Wed, 10 Dec 2008 17:03:00 +0000 (20:03 +0300)
committer Vsevolod Stakhov <vsevolod@rambler-co.ru>
Wed, 10 Dec 2008 17:03:00 +0000 (20:03 +0300)
src/cfg_file.h
src/cfg_file.l
src/cfg_file.y
src/controller.c
src/filter.c
src/util.h

index f31efdc6cbf1b4923f544d84503e19e1f56975bb..4a2fb824d141072813b508eecef76d17efe457ef 100644 (file)
@@ -51,6 +51,8 @@
                fprintf (stderr, "\n")
 
 struct expression;
+struct tokenizer;
+struct classifier;
 
 enum { VAL_UNDEF=0, VAL_TRUE, VAL_FALSE };
 
@@ -98,7 +100,10 @@ struct statfile {
        char *alias;
        char *pattern;
        double weight;
+       char *metric;
        size_t size;
+       struct tokenizer *tokenizer;
+       struct classifier *classifier;
 };
 
 struct config_file {
index f9a90bb7dbfe0ab2be117d5d177d8cc602773dd8..7d940099f3f2c4eb29b1865401915a4241bb3c52 100644 (file)
@@ -55,6 +55,8 @@ alias                                                 return ALIAS;
 pattern                                                        return PATTERN;
 weight                                                 return WEIGHT;
 size                                                   return SIZE;
+tokenizer                                              return TOKENIZER;
+classifier                                             return CLASSIFIER;
 
 logging                                                        return LOGGING;
 
index dc57df2a4cce51f4010686456f66847a91d0341c..59d70b160b6d81189e16016db468ccdc75b783ca 100644 (file)
@@ -18,6 +18,8 @@
 
 #include "cfg_file.h"
 #include "main.h"
+#include "classifiers/classifiers.h"
+#include "tokenizers/tokenizers.h"
 
 #define YYDEBUG 1
 
@@ -53,7 +55,7 @@ struct statfile *cur_statfile = NULL;
 %token  REQUIRED_SCORE FUNCTION FRACT COMPOSITES CONTROL PASSWORD
 %token  LOGGING LOG_TYPE LOG_TYPE_CONSOLE LOG_TYPE_SYSLOG LOG_TYPE_FILE
 %token  LOG_LEVEL LOG_LEVEL_DEBUG LOG_LEVEL_INFO LOG_LEVEL_WARNING LOG_LEVEL_ERROR LOG_FACILITY LOG_FILENAME
-%token  STATFILE ALIAS PATTERN WEIGHT STATFILE_POOL_SIZE SIZE
+%token  STATFILE ALIAS PATTERN WEIGHT STATFILE_POOL_SIZE SIZE TOKENIZER CLASSIFIER
 
 %type  <string>        STRING
 %type  <string>        VARIABLE
@@ -550,6 +552,15 @@ statfile:
                        yyerror ("yyparse: not enough arguments in statfile definition");
                        YYERROR;
                }
+               if (cur_statfile->metric == NULL) {
+                       cur_statfile->metric = memory_pool_strdup (cfg->cfg_pool, "default");
+               }
+               if (cur_statfile->classifier == NULL) {
+                       cur_statfile->classifier = get_classifier ("winnow");
+               }
+               if (cur_statfile->tokenizer == NULL) {
+                       cur_statfile->tokenizer = get_tokenizer ("osb-text");
+               }
                g_hash_table_insert (cfg->statfiles, cur_statfile->alias, cur_statfile);
                cur_statfile = NULL;
        }
@@ -565,6 +576,9 @@ statfilecmd:
        | statfilepattern
        | statfileweight
        | statfilesize
+       | statfilemetric
+       | statfiletokenizer
+       | statfileclassifier
        ;
        
 statfilealias:
@@ -615,6 +629,38 @@ statfilesize:
        }
        ;
 
+/*
+ * metric = "name" inside a statfile section: remember the metric name for
+ * the statfile that is currently being parsed.
+ */
+statfilemetric:
+       METRIC EQSIGN QUOTEDSTRING {
+               /* Statfile options may come in any order, so allocate the entry lazily */
+               if (cur_statfile == NULL) {
+                       cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
+               }
+               /* Keep a copy of the name in the config pool */
+               cur_statfile->metric = memory_pool_strdup (cfg->cfg_pool, $3);
+       }
+       ;
+
+/*
+ * tokenizer = "name" inside a statfile section: bind the tokenizer returned
+ * by get_tokenizer () to the statfile that is currently being parsed.
+ * A name for which get_tokenizer () returns NULL is a parse error.
+ */
+statfiletokenizer:
+       TOKENIZER EQSIGN QUOTEDSTRING {
+               /* Statfile options may come in any order, so allocate the entry lazily */
+               if (cur_statfile == NULL) {
+                       cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
+               }
+               cur_statfile->tokenizer = get_tokenizer ($3);
+               if (cur_statfile->tokenizer == NULL) {
+                       yyerror ("yyparse: unknown tokenizer %s", $3);
+                       YYERROR;
+               }
+       }
+       ;
+
+/*
+ * classifier = "name" inside a statfile section: bind the classifier returned
+ * by get_classifier () to the statfile that is currently being parsed.
+ * A name for which get_classifier () returns NULL is a parse error.
+ */
+statfileclassifier:
+       CLASSIFIER EQSIGN QUOTEDSTRING {
+               /* Statfile options may come in any order, so allocate the entry lazily */
+               if (cur_statfile == NULL) {
+                       cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
+               }
+               cur_statfile->classifier = get_classifier ($3);
+               if (cur_statfile->classifier == NULL) {
+                       yyerror ("yyparse: unknown classifier %s", $3);
+                       YYERROR;
+               }
+       }
+       ;
 
 statfile_pool_size:
        STATFILE_POOL_SIZE EQSIGN SIZELIMIT {
index fa2fa268f96c6ba6a9ea30a37d02f7b9cc3d767f..4369f7d7fa5e28fd5142d81e7f8d94f1897aac8d 100644 (file)
@@ -235,8 +235,8 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control
                                session->learn_rcpt = NULL;
                                session->learn_from = NULL;
                                session->learn_filename = NULL;
-                               session->learn_tokenizer = get_tokenizer ("osb-text");
-                               session->learn_classifier = get_classifier ("winnow");
+                               session->learn_tokenizer = statfile->tokenizer;
+                               session->learn_classifier = statfile->classifier;
                                /* By default learn positive */
                                session->in_class = 1;
                                /* Get all arguments */
@@ -262,22 +262,6 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control
                                                                }
                                                                session->learn_from = memory_pool_strdup (session->session_pool, arg);
                                                                break;
-                                                       case 't':
-                                                               arg = *(cmd_args + 1);
-                                                               if (!arg || *arg == '\0' || (session->learn_tokenizer = get_tokenizer (arg)) == NULL) {
-                                                                       r = snprintf (out_buf, sizeof (out_buf), "tokenizer is not defined" CRLF, arg);
-                                                                       bufferevent_write (session->bev, out_buf, r);
-                                                                       return;
-                                                               }
-                                                               break;
-                                                       case 'c':
-                                                               arg = *(cmd_args + 1);
-                                                               if (!arg || *arg == '\0' || (session->learn_classifier = get_classifier (arg)) == NULL) {
-                                                                       r = snprintf (out_buf, sizeof (out_buf), "classifier is not defined" CRLF, arg);
-                                                                       bufferevent_write (session->bev, out_buf, r);
-                                                                       return;
-                                                               }
-                                                               break;
                                                        case 'n':
                                                                session->in_class = 0;
                                                                break;
index ae5852efb3c157ecc4a7518001c4abe7a32b7214..5bf0db27a0a316a86933430f0a3d3556fe582632 100644 (file)
@@ -8,6 +8,9 @@
 #include "main.h"
 #include "cfg_file.h"
 #include "perl.h"
+#include "util.h"
+#include "classifiers/classifiers.h"
+#include "tokenizers/tokenizers.h"
 
 void
 insert_result (struct worker_task *task, const char *metric_name, const char *symbol, u_char flag)
@@ -330,11 +333,113 @@ composites_metric_callback (gpointer key, gpointer value, void *data)
        g_hash_table_foreach (task->cfg->composite_symbols, composites_foreach_callback, cd);
 }
 
-void make_composites (struct worker_task *task)
+void 
+make_composites (struct worker_task *task)
 {
        g_hash_table_foreach (task->results, composites_metric_callback, task);
 }
 
+/*
+ * State shared between process_statfiles () and statfiles_callback () while
+ * the statfiles of one task are being evaluated.
+ */
+struct statfile_callback_data {
+       /* Metric name (char *) -> accumulated weighted score (double *) */
+       GHashTable *metrics;
+       /* Tokenizer pointer -> token tree (GTree *) already built for the message */
+       GHashTable *tokens;
+       /* Task whose message is being classified */
+       struct worker_task *task;
+};
+
+/*
+ * GHashTable foreach callback: classify the task message against one statfile.
+ *
+ * key   - hash key of the statfile entry (unused)
+ * value - struct statfile describing pattern, tokenizer, classifier and weight
+ * arg   - struct statfile_callback_data shared between all invocations
+ *
+ * The classifier result, multiplied by the statfile weight, is added to the
+ * per-metric accumulator stored in the callback data.
+ */
+static void
+statfiles_callback (gpointer key, gpointer value, void *arg)
+{
+       struct statfile_callback_data *cbd = (struct statfile_callback_data *)arg;
+       struct statfile *statfile = (struct statfile *)value;
+       struct worker_task *task = cbd->task;
+       GTree *token_tree;
+       char *rcpt_addr, *path;
+       double result, score, *total;
+
+       /* XXX: handle multiple recipients correctly; for now they resolve with an empty recipient */
+       rcpt_addr = (g_list_length (task->rcpt) == 1) ? (char *)task->rcpt->data : "";
+       path = resolve_stat_filename (task->task_pool, statfile->pattern, task->from, rcpt_addr);
+
+       if (statfile_pool_open (task->worker->srv->statfile_pool, path) == -1) {
+               return;
+       }
+
+       /* Tokenize the message only once per tokenizer and reuse the tree afterwards */
+       token_tree = g_hash_table_lookup (cbd->tokens, statfile->tokenizer);
+       if (token_tree == NULL) {
+               /* Tree would be freed at task pool freeing */
+               token_tree = statfile->tokenizer->tokenize_func (statfile->tokenizer, task->task_pool, task->msg->buf);
+               if (token_tree == NULL) {
+                       msg_info ("statfiles_callback: cannot tokenize input");
+                       return;
+               }
+               g_hash_table_insert (cbd->tokens, statfile->tokenizer, token_tree);
+       }
+
+       result = statfile->classifier->classify_func (task->worker->srv->statfile_pool, path, token_tree);
+
+       /* Ignore results that are not above the threshold (this also skips NaN) */
+       if (!(result > 0.000001)) {
+               return;
+       }
+
+       score = result * statfile->weight;
+       total = g_hash_table_lookup (cbd->metrics, statfile->metric);
+       if (total != NULL) {
+               *total += score;
+       }
+       else {
+               /* First hit for this metric: start a new accumulator in the task pool */
+               total = memory_pool_alloc (task->task_pool, sizeof (double));
+               *total = score;
+               g_hash_table_insert (cbd->metrics, statfile->metric, total);
+       }
+}
+
+/*
+ * GHashTable foreach callback: merge one accumulated statfile score into the
+ * task results.
+ *
+ * key   - metric name (char *)
+ * value - accumulated score for that metric (double *)
+ * arg   - struct worker_task the score belongs to
+ *
+ * Scores for metric names that are not present in the configuration are
+ * dropped.
+ */
+static void
+statfiles_results_callback (gpointer key, gpointer value, void *arg)
+{
+       struct worker_task *task = (struct worker_task *)arg;
+       double score = *(double *)value;
+       struct metric *cfg_metric;
+       struct metric_result *res;
+
+       cfg_metric = g_hash_table_lookup (task->worker->srv->cfg->metrics, (char *)key);
+       if (cfg_metric == NULL) {
+               return;
+       }
+
+       res = g_hash_table_lookup (task->results, (char *)key);
+       if (res != NULL) {
+               res->score += score;
+       }
+       else {
+               /* Create new metric chain */
+               res = memory_pool_alloc (task->task_pool, sizeof (struct metric_result));
+               res->symbols = g_hash_table_new (g_str_hash, g_str_equal);
+               /* Have the symbols table destroyed through the task pool destructor list */
+               memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_hash_table_destroy, res->symbols);
+               res->metric = cfg_metric;
+               res->score = score;
+               g_hash_table_insert (task->results, key, res);
+       }
+       /* The metric name itself is recorded as the symbol entry */
+       g_hash_table_insert (res->symbols, key, GSIZE_TO_POINTER (1));
+}
+
+
+/*
+ * Run every configured statfile against the task message and merge the
+ * resulting per-metric scores into the task results.
+ */
+void
+process_statfiles (struct worker_task *task)
+{
+       struct statfile_callback_data cbdata;
+
+       /* Scratch tables that only live for the duration of this call */
+       cbdata.metrics = g_hash_table_new (g_str_hash, g_str_equal);
+       cbdata.tokens = g_hash_table_new (g_direct_hash, g_direct_equal);
+       cbdata.task = task;
+
+       /* First pass: classify the message and accumulate scores per metric */
+       g_hash_table_foreach (task->cfg->statfiles, statfiles_callback, &cbdata);
+       /* Second pass: move the accumulated scores into the task results */
+       g_hash_table_foreach (cbdata.metrics, statfiles_results_callback, task);
+
+       g_hash_table_destroy (cbdata.metrics);
+       g_hash_table_destroy (cbdata.tokens);
+}
 
 /* 
  * vi:ts=4 
index 9c05a2a27645b82c7f9c63c002935f0a5c40dedf..5d4590c4c987fde05db4530347f56a2188f63f9f 100644 (file)
@@ -42,7 +42,13 @@ int setproctitle(const char *fmt, ...);
 #ifndef HAVE_PIDFILE
 struct pidfh {
        int pf_fd;
+#ifdef HAVE_PATH_MAX
+       char    pf_path[PATH_MAX + 1];
+#elif defined(HAVE_MAXPATHLEN)
        char    pf_path[MAXPATHLEN + 1];
+#else
+       char    pf_path[1024 + 1];
+#endif
        __dev_t pf_dev;
        ino_t   pf_ino;
 };