diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2010-12-24 18:57:55 +0300 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2010-12-24 18:57:55 +0300 |
commit | 6ae69bd5f741a04b9c38a7de83f6b6ed263e1c1f (patch) | |
tree | 29d90efa9d0457c2bf611ef5ce37533109a422b2 | |
parent | dff94626ac8a2edd24a8524cdbb00dd7dc2222d8 (diff) | |
download | rspamd-6ae69bd5f741a04b9c38a7de83f6b6ed263e1c1f.tar.gz rspamd-6ae69bd5f741a04b9c38a7de83f6b6ed263e1c1f.zip |
* Add Subject header to statistics
* Write a log message listing the symbols that are removed when a composite symbol is inserted
-rw-r--r-- | src/controller.c | 4 | ||||
-rw-r--r-- | src/filter.c | 13 | ||||
-rw-r--r-- | src/tokenizers/tokenizers.c | 38 | ||||
-rw-r--r-- | src/tokenizers/tokenizers.h | 2 |
4 files changed, 56 insertions, 1 deletion
diff --git a/src/controller.c b/src/controller.c index 617821bf6..a39ec21cb 100644 --- a/src/controller.c +++ b/src/controller.c @@ -845,7 +845,9 @@ controller_read_socket (f_str_t * in, void *arg) session->state = STATE_REPLY; return TRUE; } - + + /* Take care of subject */ + tokenize_subject (task, &tokens); /* Init classifier */ cls_ctx = session->learn_classifier->classifier->init_func (session->session_pool, session->learn_classifier); diff --git a/src/filter.c b/src/filter.c index f47dd1a3a..e1e5d06bc 100644 --- a/src/filter.c +++ b/src/filter.c @@ -319,6 +319,8 @@ composites_foreach_callback (gpointer key, gpointer value, void *data) GQueue *stack; GList *symbols = NULL, *s; gsize cur, op1, op2; + gchar logbuf[256]; + gint r; stack = g_queue_new (); @@ -367,8 +369,15 @@ composites_foreach_callback (gpointer key, gpointer value, void *data) if (op1) { /* Remove all symbols that are in composite symbol */ s = g_list_first (symbols); + r = rspamd_snprintf (logbuf, sizeof (logbuf), "<%s>, insert symbol %s instead of symbols: ", cd->task->message_id, key); while (s) { g_hash_table_remove (cd->metric_res->symbols, s->data); + if (s->next) { + r += rspamd_snprintf (logbuf + r, sizeof (logbuf) -r, "%s, ", s->data); + } + else { + r += rspamd_snprintf (logbuf + r, sizeof (logbuf) -r, "%s", s->data); + } s = g_list_next (s); } /* Add new symbol */ @@ -432,6 +441,8 @@ process_autolearn (struct statfile *st, struct worker_task *task, GTree * tokens if (check_autolearn (st->autolearn, task)) { if (tokens) { + /* Take care of subject */ + tokenize_subject (task, &tokens); msg_info ("message with id <%s> autolearned statfile '%s'", task->message_id, filename); /* Get or create statfile */ @@ -527,6 +538,8 @@ classifiers_callback (gpointer value, void *arg) return; } + /* Take care of subject */ + tokenize_subject (task, &tokens); cl->classifier->classify_func (ctx, task->worker->srv->statfile_pool, tokens, task); /* Autolearning */ diff --git a/src/tokenizers/tokenizers.c 
b/src/tokenizers/tokenizers.c index 9ca690e47..ab073a28c 100644 --- a/src/tokenizers/tokenizers.c +++ b/src/tokenizers/tokenizers.c @@ -222,6 +222,44 @@ tokenize_headers (memory_pool_t * pool, struct worker_task *task, GTree ** tree) return TRUE; } +void +tokenize_subject (struct worker_task *task, GTree ** tree) +{ + f_str_t subject, subject_name; + const gchar *sub; + token_node_t *new = NULL; + + if (*tree == NULL) { + *tree = g_tree_new (token_node_compare_func); + memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_tree_destroy, *tree); + } + + subject_name.begin = "Subject:"; + subject_name.len = sizeof ("Subject:") - 1; + + /* Try to use pre-defined subject */ + if (task->subject != NULL) { + new = memory_pool_alloc (task->task_pool, sizeof (token_node_t)); + subject.begin = task->subject; + subject.len = strlen (task->subject); + new->h1 = fstrhash (&subject_name) * primes[0]; + new->h2 = fstrhash (&subject) * primes[1]; + if (g_tree_lookup (*tree, new) == NULL) { + g_tree_insert (*tree, new, new); + } + } + if ((sub = g_mime_message_get_subject (task->message)) != NULL) { + new = memory_pool_alloc (task->task_pool, sizeof (token_node_t)); + subject.begin = (gchar *)sub; + subject.len = strlen (sub); + new->h1 = fstrhash (&subject_name) * primes[0]; + new->h2 = fstrhash (&subject) * primes[1]; + if (g_tree_lookup (*tree, new) == NULL) { + g_tree_insert (*tree, new, new); + } + } +} + /* * vi:ts=4 */ diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h index 9a16e907c..21e454e6b 100644 --- a/src/tokenizers/tokenizers.h +++ b/src/tokenizers/tokenizers.h @@ -40,6 +40,8 @@ int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t int tokenize_urls (memory_pool_t *pool, struct worker_task *task, GTree **cur); /* Common tokenizer for headers */ int tokenize_headers (memory_pool_t *pool, struct worker_task *task, GTree **cur); +/* Make tokens for a subject */ +void tokenize_subject (struct worker_task *task, 
GTree ** tree); /* Array of all defined tokenizers */ extern struct tokenizer tokenizers[]; |