aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2008-12-10 20:03:00 +0300
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2008-12-10 20:03:00 +0300
commit93e2e9d2faf9495283b2709f8749d81542d21ed1 (patch)
tree72817ab774cd081b2687c914f08b2bcd96432175 /src
parentc9df6177c0b5f8fb5bd2c6c4947c52e184b0b556 (diff)
downloadrspamd-93e2e9d2faf9495283b2709f8749d81542d21ed1.tar.gz
rspamd-93e2e9d2faf9495283b2709f8749d81542d21ed1.zip
* Add logic for filtering messages with tokenizer/classifier pair
Diffstat (limited to 'src')
-rw-r--r--src/cfg_file.h5
-rw-r--r--src/cfg_file.l2
-rw-r--r--src/cfg_file.y48
-rw-r--r--src/controller.c20
-rw-r--r--src/filter.c107
-rw-r--r--src/util.h6
6 files changed, 168 insertions, 20 deletions
diff --git a/src/cfg_file.h b/src/cfg_file.h
index f31efdc6c..4a2fb824d 100644
--- a/src/cfg_file.h
+++ b/src/cfg_file.h
@@ -51,6 +51,8 @@
fprintf (stderr, "\n")
struct expression;
+struct tokenizer;
+struct classifier;
enum { VAL_UNDEF=0, VAL_TRUE, VAL_FALSE };
@@ -98,7 +100,10 @@ struct statfile {
char *alias;
char *pattern;
double weight;
+ char *metric;
size_t size;
+ struct tokenizer *tokenizer;
+ struct classifier *classifier;
};
struct config_file {
diff --git a/src/cfg_file.l b/src/cfg_file.l
index f9a90bb7d..7d940099f 100644
--- a/src/cfg_file.l
+++ b/src/cfg_file.l
@@ -55,6 +55,8 @@ alias return ALIAS;
pattern return PATTERN;
weight return WEIGHT;
size return SIZE;
+tokenizer return TOKENIZER;
+classifier return CLASSIFIER;
logging return LOGGING;
diff --git a/src/cfg_file.y b/src/cfg_file.y
index dc57df2a4..59d70b160 100644
--- a/src/cfg_file.y
+++ b/src/cfg_file.y
@@ -18,6 +18,8 @@
#include "cfg_file.h"
#include "main.h"
+#include "classifiers/classifiers.h"
+#include "tokenizers/tokenizers.h"
#define YYDEBUG 1
@@ -53,7 +55,7 @@ struct statfile *cur_statfile = NULL;
%token REQUIRED_SCORE FUNCTION FRACT COMPOSITES CONTROL PASSWORD
%token LOGGING LOG_TYPE LOG_TYPE_CONSOLE LOG_TYPE_SYSLOG LOG_TYPE_FILE
%token LOG_LEVEL LOG_LEVEL_DEBUG LOG_LEVEL_INFO LOG_LEVEL_WARNING LOG_LEVEL_ERROR LOG_FACILITY LOG_FILENAME
-%token STATFILE ALIAS PATTERN WEIGHT STATFILE_POOL_SIZE SIZE
+%token STATFILE ALIAS PATTERN WEIGHT STATFILE_POOL_SIZE SIZE TOKENIZER CLASSIFIER
%type <string> STRING
%type <string> VARIABLE
@@ -550,6 +552,15 @@ statfile:
yyerror ("yyparse: not enough arguments in statfile definition");
YYERROR;
}
+ if (cur_statfile->metric == NULL) {
+ cur_statfile->metric = memory_pool_strdup (cfg->cfg_pool, "default");
+ }
+ if (cur_statfile->classifier == NULL) {
+ cur_statfile->classifier = get_classifier ("winnow");
+ }
+ if (cur_statfile->tokenizer == NULL) {
+ cur_statfile->tokenizer = get_tokenizer ("osb-text");
+ }
g_hash_table_insert (cfg->statfiles, cur_statfile->alias, cur_statfile);
cur_statfile = NULL;
}
@@ -565,6 +576,9 @@ statfilecmd:
| statfilepattern
| statfileweight
| statfilesize
+ | statfilemetric
+ | statfiletokenizer
+ | statfileclassifier
;
statfilealias:
@@ -615,6 +629,38 @@ statfilesize:
}
;
+/*
+ * Statfile option made of the METRIC token, '=' and a quoted string.
+ * Creates cur_statfile on demand (memory_pool_alloc0 from cfg->cfg_pool)
+ * and stores the memory_pool_strdup copy of the string in its metric field.
+ */
+statfilemetric:
+ METRIC EQSIGN QUOTEDSTRING {
+ if (cur_statfile == NULL) {
+ cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
+ }
+ cur_statfile->metric = memory_pool_strdup (cfg->cfg_pool, $3);
+ }
+ ;
+
+/*
+ * Statfile option made of the TOKENIZER token, '=' and a quoted string.
+ * Creates cur_statfile on demand, then resolves the quoted name with
+ * get_tokenizer. A NULL result is reported through yyerror and the rule is
+ * aborted with YYERROR; note the tokenizer field has already been
+ * overwritten with NULL at that point.
+ */
+statfiletokenizer:
+ TOKENIZER EQSIGN QUOTEDSTRING {
+ if (cur_statfile == NULL) {
+ cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
+ }
+ if ((cur_statfile->tokenizer = get_tokenizer ($3)) == NULL) {
+ yyerror ("yyparse: unknown tokenizer %s", $3);
+ YYERROR;
+ }
+ }
+ ;
+
+/*
+ * Statfile option made of the CLASSIFIER token, '=' and a quoted string.
+ * Creates cur_statfile on demand, then resolves the quoted name with
+ * get_classifier. A NULL result is reported through yyerror and the rule is
+ * aborted with YYERROR; note the classifier field has already been
+ * overwritten with NULL at that point.
+ */
+statfileclassifier:
+ CLASSIFIER EQSIGN QUOTEDSTRING {
+ if (cur_statfile == NULL) {
+ cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
+ }
+ if ((cur_statfile->classifier = get_classifier ($3)) == NULL) {
+ yyerror ("yyparse: unknown classifier %s", $3);
+ YYERROR;
+ }
+ }
+ ;
statfile_pool_size:
STATFILE_POOL_SIZE EQSIGN SIZELIMIT {
diff --git a/src/controller.c b/src/controller.c
index fa2fa268f..4369f7d7f 100644
--- a/src/controller.c
+++ b/src/controller.c
@@ -235,8 +235,8 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control
session->learn_rcpt = NULL;
session->learn_from = NULL;
session->learn_filename = NULL;
- session->learn_tokenizer = get_tokenizer ("osb-text");
- session->learn_classifier = get_classifier ("winnow");
+ session->learn_tokenizer = statfile->tokenizer;
+ session->learn_classifier = statfile->classifier;
/* By default learn positive */
session->in_class = 1;
/* Get all arguments */
@@ -262,22 +262,6 @@ process_command (struct controller_command *cmd, char **cmd_args, struct control
}
session->learn_from = memory_pool_strdup (session->session_pool, arg);
break;
- case 't':
- arg = *(cmd_args + 1);
- if (!arg || *arg == '\0' || (session->learn_tokenizer = get_tokenizer (arg)) == NULL) {
- r = snprintf (out_buf, sizeof (out_buf), "tokenizer is not defined" CRLF, arg);
- bufferevent_write (session->bev, out_buf, r);
- return;
- }
- break;
- case 'c':
- arg = *(cmd_args + 1);
- if (!arg || *arg == '\0' || (session->learn_classifier = get_classifier (arg)) == NULL) {
- r = snprintf (out_buf, sizeof (out_buf), "classifier is not defined" CRLF, arg);
- bufferevent_write (session->bev, out_buf, r);
- return;
- }
- break;
case 'n':
session->in_class = 0;
break;
diff --git a/src/filter.c b/src/filter.c
index ae5852efb..5bf0db27a 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -8,6 +8,9 @@
#include "main.h"
#include "cfg_file.h"
#include "perl.h"
+#include "util.h"
+#include "classifiers/classifiers.h"
+#include "tokenizers/tokenizers.h"
void
insert_result (struct worker_task *task, const char *metric_name, const char *symbol, u_char flag)
@@ -330,11 +333,113 @@ composites_metric_callback (gpointer key, gpointer value, void *data)
g_hash_table_foreach (task->cfg->composite_symbols, composites_foreach_callback, cd);
}
-void make_composites (struct worker_task *task)
+/* Runs composites_metric_callback on every entry of task->results, passing the task as callback data. */
+void
+make_composites (struct worker_task *task)
{
g_hash_table_foreach (task->results, composites_metric_callback, task);
}
+/* State shared between the per-statfile and per-metric callbacks of process_statfiles. */
+struct statfile_callback_data {
+ GHashTable *metrics; /* st->metric string -> pointer to the accumulated double weight */
+ GHashTable *tokens; /* st->tokenizer pointer -> token tree produced by that tokenizer */
+ struct worker_task *task; /* task whose message is being classified */
+};
+
+/*
+ * Hash-table iteration callback, called once per configured statfile.
+ *
+ * key   - unused here
+ * value - struct statfile describing the statfile to evaluate
+ * arg   - struct statfile_callback_data carrying the task and the two caches
+ *
+ * Resolves the statfile name for this task, opens it in the statfile pool,
+ * tokenizes the message (once per tokenizer) and runs the statfile's
+ * classifier. A result above the threshold is multiplied by st->weight and
+ * added to the accumulator stored under st->metric in data->metrics.
+ */
+static void
+statfiles_callback (gpointer key, gpointer value, void *arg)
+{
+ struct statfile_callback_data *data= (struct statfile_callback_data *)arg;
+ struct worker_task *task = data->task;
+ struct statfile *st = (struct statfile *)value;
+ GTree *tokens;
+ char *filename;
+ double weight, *w;
+
+ /* The recipient is passed to the resolver only when there is exactly one */
+ if (g_list_length (task->rcpt) == 1) {
+ filename = resolve_stat_filename (task->task_pool, st->pattern, task->from, (char *)task->rcpt->data);
+ }
+ else {
+ /* XXX: handle multiple recipients correctly; for now an empty recipient is used */
+ filename = resolve_stat_filename (task->task_pool, st->pattern, task->from, "");
+ }
+
+ /* A statfile that cannot be opened is silently skipped */
+ if (statfile_pool_open (task->worker->srv->statfile_pool, filename) == -1) {
+ return;
+ }
+
+ /* Reuse the token tree if this tokenizer has already been run for the task */
+ if ((tokens = g_hash_table_lookup (data->tokens, st->tokenizer)) == NULL) {
+ /* The tree is expected to be freed together with the task pool */
+ tokens = st->tokenizer->tokenize_func (st->tokenizer, task->task_pool, task->msg->buf);
+ if (tokens == NULL) {
+ msg_info ("statfiles_callback: cannot tokenize input");
+ return;
+ }
+ g_hash_table_insert (data->tokens, st->tokenizer, tokens);
+ }
+
+ weight = st->classifier->classify_func (task->worker->srv->statfile_pool, filename, tokens);
+
+ /* Results not greater than 0.000001 contribute nothing to the metric */
+ if (weight > 0.000001) {
+ if ((w = g_hash_table_lookup (data->metrics, st->metric)) == NULL) {
+ /* First hit for this metric: create the accumulator in the task pool */
+ w = memory_pool_alloc (task->task_pool, sizeof (double));
+ *w = weight * st->weight;
+ g_hash_table_insert (data->metrics, st->metric, w);
+ }
+ else {
+ *w += weight * st->weight;
+ }
+ }
+
+}
+
+/*
+ * Hash-table iteration callback that merges one accumulated weight into the
+ * task results.
+ *
+ * key   - metric name (char *)
+ * value - pointer to the double weight accumulated for that metric
+ * arg   - struct worker_task being processed
+ *
+ * Names that are not present in the configured metrics table are ignored.
+ * Otherwise the weight is added to the metric's result in task->results,
+ * creating that result first when it does not exist yet.
+ */
+static void
+statfiles_results_callback (gpointer key, gpointer value, void *arg)
+{
+ struct worker_task *task = (struct worker_task *)arg;
+ struct metric_result *metric_res;
+ struct metric *metric;
+ double w;
+
+ metric_res = g_hash_table_lookup (task->results, (char *)key);
+ w = *(double *)value;
+
+ metric = g_hash_table_lookup (task->worker->srv->cfg->metrics, (char *)key);
+ if (metric == NULL) {
+ return;
+ }
+
+ if (metric_res == NULL) {
+ /* Create new metric chain */
+ /* NOTE(review): memory_pool_alloc is used and only symbols, metric and score are set below - confirm struct metric_result has no other fields */
+ metric_res = memory_pool_alloc (task->task_pool, sizeof (struct metric_result));
+ metric_res->symbols = g_hash_table_new (g_str_hash, g_str_equal);
+ /* The symbols table is destroyed by a destructor registered on the task pool */
+ memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_hash_table_destroy, metric_res->symbols);
+ metric_res->metric = metric;
+ metric_res->score = w;
+ g_hash_table_insert (task->results, key, metric_res);
+ }
+ else {
+ metric_res->score += w;
+ }
+ /* The metric name itself is recorded in the symbols table, with value 1 */
+ g_hash_table_insert (metric_res->symbols, key, GSIZE_TO_POINTER (1));
+
+}
+
+
+/*
+ * Classifies the task's message against every statfile in
+ * task->cfg->statfiles and merges the per-metric weights into task->results.
+ *
+ * Two temporary tables live only for the duration of the call: cd.tokens
+ * (keyed by pointer identity) and cd.metrics (keyed by string).
+ */
+void
+process_statfiles (struct worker_task *task)
+{
+ struct statfile_callback_data cd;
+
+ cd.task = task;
+ cd.tokens = g_hash_table_new (g_direct_hash, g_direct_equal);
+ cd.metrics = g_hash_table_new (g_str_hash, g_str_equal);
+
+ /* First pass fills cd.metrics; second pass copies it into task->results */
+ g_hash_table_foreach (task->cfg->statfiles, statfiles_callback, &cd);
+ g_hash_table_foreach (cd.metrics, statfiles_results_callback, task);
+
+ /* Tables were created without destroy notifiers, so keys and values are left untouched */
+ g_hash_table_destroy (cd.tokens);
+ g_hash_table_destroy (cd.metrics);
+}
/*
* vi:ts=4
diff --git a/src/util.h b/src/util.h
index 9c05a2a27..5d4590c4c 100644
--- a/src/util.h
+++ b/src/util.h
@@ -42,7 +42,13 @@ int setproctitle(const char *fmt, ...);
#ifndef HAVE_PIDFILE
struct pidfh {
int pf_fd;
+ /* Path buffer length: PATH_MAX + 1 if HAVE_PATH_MAX, else MAXPATHLEN + 1 if HAVE_MAXPATHLEN, else a fixed 1024 + 1 */
+#ifdef HAVE_PATH_MAX
+ char pf_path[PATH_MAX + 1];
+#elif defined(HAVE_MAXPATHLEN)
char pf_path[MAXPATHLEN + 1];
+#else
+ char pf_path[1024 + 1];
+#endif
__dev_t pf_dev;
ino_t pf_ino;
};