summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-07-09 20:45:11 +0400
committer Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-07-09 20:45:11 +0400
commit 2234daebbb352b444b322d43cc6c1093f0ce949c (patch)
tree 320131facabccd4f5aa3eddc465bc50a707b2b00
parent 19baadf6a0e6b2554de67b674a2c6f30efda13bb (diff)
download rspamd-2234daebbb352b444b322d43cc6c1093f0ce949c.tar.gz
rspamd-2234daebbb352b444b322d43cc6c1093f0ce949c.zip
* Make autolearn working
-rw-r--r-- src/filter.c 105
-rw-r--r-- src/filter.h 1
-rw-r--r-- src/fstring.c 5
-rw-r--r-- src/statfile.c 10
-rw-r--r-- src/tokenizers/osb.c 10
-rw-r--r-- src/tokenizers/tokenizers.c 8
-rw-r--r-- src/util.c 2
-rw-r--r-- src/view.c 2
8 files changed, 122 insertions, 21 deletions
diff --git a/src/filter.c b/src/filter.c
index daa9b0e29..1c45f0886 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -62,6 +62,7 @@ insert_result (struct worker_task *task, const char *metric_name, const char *sy
/* Create new metric chain */
metric_res = memory_pool_alloc (task->task_pool, sizeof (struct metric_result));
metric_res->symbols = g_hash_table_new (g_str_hash, g_str_equal);
+ metric_res->checked = FALSE;
memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_hash_table_destroy, metric_res->symbols);
metric_res->metric = metric;
g_hash_table_insert (task->results, (gpointer)metric_name, metric_res);
@@ -214,11 +215,19 @@ call_filter_by_name (struct worker_task *task, const char *name, enum filter_typ
}
static void
-metric_process_callback (gpointer key, gpointer value, void *data)
+metric_process_callback_common (gpointer key, gpointer value, void *data, gboolean is_forced)
{
struct worker_task *task = (struct worker_task *)data;
struct metric_result *metric_res = (struct metric_result *)value;
+ if (metric_res->checked && !is_forced) {
+ /* Already checked */
+ return;
+ }
+
+ /* Set flag */
+ metric_res->checked = TRUE;
+
if (metric_res->metric->func != NULL) {
metric_res->score = metric_res->metric->func (task, metric_res->metric->name, metric_res->metric->func_name);
}
@@ -229,6 +238,18 @@ metric_process_callback (gpointer key, gpointer value, void *data)
metric_res->score, metric_res->metric->name);
}
+/**
+ * Hash-table style callback (key, value, user data) that consolidates one
+ * metric result only if it has not been checked yet.
+ *
+ * Forwards to metric_process_callback_common with is_forced set to FALSE.
+ */
+static void
+metric_process_callback_normal (gpointer key, gpointer value, void *data)
+{
+	const gboolean is_forced = FALSE;
+
+	metric_process_callback_common (key, value, data, is_forced);
+}
+
+/**
+ * Hash-table style callback (key, value, user data) that consolidates one
+ * metric result even when its checked flag is already set.
+ *
+ * Forwards to metric_process_callback_common with is_forced set to TRUE.
+ */
+static void
+metric_process_callback_forced (gpointer key, gpointer value, void *data)
+{
+	const gboolean is_forced = TRUE;
+
+	metric_process_callback_common (key, value, data, is_forced);
+}
+
static int
continue_process_filters (struct worker_task *task)
{
@@ -359,7 +380,7 @@ process_filters (struct worker_task *task)
}
/* Process all metrics */
- g_hash_table_foreach (task->results, metric_process_callback, task);
+ g_hash_table_foreach (task->results, metric_process_callback_forced, task);
return 1;
}
@@ -443,6 +464,75 @@ composites_foreach_callback (gpointer key, gpointer value, void *data)
return;
}
+/**
+ * Decide whether the current task satisfies a statfile's autolearn rules.
+ *
+ * The metric named by params->metric (DEFAULT_METRIC when unset) is looked up
+ * in task->results. Its score is consolidated (non-forced) and compared with
+ * the configured thresholds; a threshold equal to 0 is treated as disabled.
+ * When params->symbols is set, every listed symbol must also be present in
+ * the metric result.
+ *
+ * @param params autolearn settings of the statfile; may be NULL, in which
+ *               case autolearn is never allowed
+ * @param task   worker task whose results are inspected
+ * @return TRUE if autolearning is allowed for this task, FALSE otherwise
+ */
+static gboolean
+check_autolearn (struct statfile_autolearn_params *params, struct worker_task *task)
+{
+	const char *metric_name = DEFAULT_METRIC;
+	struct metric_result *metric_res;
+	GList *cur;
+
+	if (params == NULL) {
+		/* Statfile has no autolearn settings, so nothing can match */
+		return FALSE;
+	}
+
+	if (params->metric != NULL) {
+		metric_name = params->metric;
+	}
+
+	/* First check threshold */
+	metric_res = g_hash_table_lookup (task->results, metric_name);
+	if (metric_res == NULL) {
+		if (params->symbols == NULL && params->threshold_max > 0) {
+			/* For ham messages */
+			return TRUE;
+		}
+		msg_debug ("check_autolearn: metric %s has no results", metric_name);
+		return FALSE;
+	}
+
+	/* Process score of metric */
+	metric_process_callback_normal ((void *)metric_name, metric_res, task);
+	if (!((params->threshold_min != 0 && metric_res->score > params->threshold_min) ||
+		(params->threshold_max != 0 && metric_res->score < params->threshold_max))) {
+		/* Score is outside of every enabled threshold */
+		return FALSE;
+	}
+
+	/* Now check for specific symbols: all of them must be present */
+	for (cur = params->symbols; cur != NULL; cur = g_list_next (cur)) {
+		if (g_hash_table_lookup (metric_res->symbols, cur->data) == NULL) {
+			return FALSE;
+		}
+	}
+
+	/* Now allow processing of actual autolearn */
+	return TRUE;
+}
+
+/**
+ * Learn a statfile from the current message when autolearn rules allow it.
+ *
+ * Nothing happens unless check_autolearn accepts the task and a token tree
+ * is available. If the statfile is not open in the server's statfile pool,
+ * it is opened, or created when opening fails; a creation failure is logged
+ * and aborts the learning.
+ *
+ * @param st         statfile description (autolearn settings and size)
+ * @param task       worker task being processed
+ * @param tokens     tokens of the message, may be NULL
+ * @param classifier classifier whose learn_func is invoked
+ * @param filename   resolved statfile name
+ * @param ctx        classifier context passed through to learn_func
+ */
+static void
+process_autolearn (struct statfile *st, struct worker_task *task, GTree *tokens,
+		struct classifier *classifier, char *filename, struct classifier_ctx* ctx)
+{
+	if (!check_autolearn (st->autolearn, task) || tokens == NULL) {
+		return;
+	}
+
+	msg_info ("process_autolearn: message with id <%s> autolearned statfile '%s'", task->message_id, filename);
+
+	/* Make sure the statfile is available: already open, else open, else create */
+	if (statfile_pool_is_open (task->worker->srv->statfile_pool, filename) == NULL &&
+		statfile_pool_open (task->worker->srv->statfile_pool, filename) == NULL &&
+		statfile_pool_create (task->worker->srv->statfile_pool,
+			filename, st->size / sizeof (struct stat_file_block)) == -1) {
+		msg_info ("process_autolearn: error while creating statfile %s", filename);
+		return;
+	}
+
+	classifier->learn_func (ctx, task->worker->srv->statfile_pool, filename, tokens, 1);
+}
+
static void
composites_metric_callback (gpointer key, gpointer value, void *data)
{
@@ -498,7 +588,7 @@ statfiles_callback (gpointer key, gpointer value, void *arg)
filename = resolve_stat_filename (task->task_pool, st->pattern, task->from, "");
}
- if (statfile_pool_open (task->worker->srv->statfile_pool, filename) == NULL) {
+ if (statfile_pool_open (task->worker->srv->statfile_pool, filename) == NULL && !check_autolearn (st->autolearn, task)) {
return;
}
@@ -513,6 +603,7 @@ statfiles_callback (gpointer key, gpointer value, void *arg)
msg_info ("statfiles_callback: cannot tokenize input");
return;
}
+ cur = g_list_next (cur);
}
g_hash_table_insert (data->tokens, st->tokenizer, tokens);
}
@@ -533,6 +624,10 @@ statfiles_callback (gpointer key, gpointer value, void *arg)
classifier->classify_func (res_data->ctx, task->worker->srv->statfile_pool, filename, tokens, st->weight);
+ if (st->autolearn) {
+ /* Process autolearn */
+ process_autolearn (st, task, tokens, classifier, filename, res_data->ctx);
+ }
}
static void
@@ -548,7 +643,6 @@ statfiles_results_callback (gpointer key, gpointer value, void *arg)
filename = classifier->result_file_func (res->ctx, w);
insert_result (task, res->metric->name, classifier->name, *w, NULL);
msg_debug ("statfiles_results_callback: got total weight %.2f for metric %s", *w, res->metric->name);
-
}
@@ -566,7 +660,8 @@ process_statfiles (struct worker_task *task)
g_hash_table_destroy (cd.tokens);
g_hash_table_destroy (cd.classifiers);
- g_hash_table_foreach (task->results, metric_process_callback, task);
+ /* Process results */
+ g_hash_table_foreach (task->results, metric_process_callback_forced, task);
task->state = WRITE_REPLY;
}
diff --git a/src/filter.h b/src/filter.h
index c460ec317..e0c989f85 100644
--- a/src/filter.h
+++ b/src/filter.h
@@ -49,6 +49,7 @@ struct metric_result {
struct metric *metric; /**< pointer to metric structure */
double score; /**< total score */
GHashTable *symbols; /**< symbols of metric */
+ gboolean checked; /**< whether metric result is consolidated */
};
/**
diff --git a/src/fstring.c b/src/fstring.c
index 935c8bdcc..00ca4ed12 100644
--- a/src/fstring.c
+++ b/src/fstring.c
@@ -306,19 +306,20 @@ fstrhash (f_str_t *str)
size_t i;
uint32_t hval;
uint32_t tmp;
+ char *c = str->begin;
if (str == NULL) {
return 0;
}
hval = str->len;
- for (i = 0; i < str->len; i++) {
+ for (i = 0; i < str->len; i++, c++) {
/*
* xor in the current byte against each byte of hval
* (which alone gaurantees that every bit of input will have
* an effect on the output)
*/
- tmp = *(str->begin + i) & 0xFF;
+ tmp = *c & 0xFF;
tmp = tmp | (tmp << 8) | (tmp << 16) | (tmp << 24);
hval ^= tmp;
diff --git a/src/statfile.c b/src/statfile.c
index ac0c3bfaa..4a52008ed 100644
--- a/src/statfile.c
+++ b/src/statfile.c
@@ -129,9 +129,8 @@ statfile_pool_open (statfile_pool_t *pool, char *filename)
struct stat st;
stat_file_t *new_file;
- if (statfile_pool_is_open (pool, filename) != NULL) {
- msg_info ("statfile_pool_open: file %s is already opened", filename);
- return NULL;
+ if ((new_file = statfile_pool_is_open (pool, filename)) != NULL) {
+ return new_file;
}
if (pool->opened >= STATFILES_MAX - 1) {
@@ -400,9 +399,10 @@ statfile_pool_set_block (statfile_pool_t *pool, stat_file_t *file, uint32_t h1,
stat_file_t *
statfile_pool_is_open (statfile_pool_t *pool, char *filename)
{
- static stat_file_t f;
+ static stat_file_t f, *ret;
f.filename = filename;
- return bsearch (&f, pool->files, pool->opened, sizeof (stat_file_t), cmpstatfile);
+ ret = bsearch (&f, pool->files, pool->opened, sizeof (stat_file_t), cmpstatfile);
+ return ret;
}
uint32_t
diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c
index 32d6b902a..d2a1fe22f 100644
--- a/src/tokenizers/osb.c
+++ b/src/tokenizers/osb.c
@@ -29,6 +29,8 @@
#include <sys/types.h>
#include "tokenizers.h"
+/* Minimum length of token */
+#define MIN_LEN 4
extern const int primes[];
@@ -36,7 +38,7 @@ int
osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **tree)
{
token_node_t *new = NULL;
- f_str_t token = { NULL, 0, 0 };
+ f_str_t token = { NULL, 0, 0 }, *res;
uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2;
int i;
@@ -52,7 +54,11 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in
msg_debug ("osb_tokenize_text: got input length: %zd", input->len);
- while (tokenizer->get_next_word (input, &token)) {
+ while ((res = tokenizer->get_next_word (input, &token)) != NULL) {
+ /* Skip small words */
+ if (token.len < MIN_LEN) {
+ continue;
+ }
/* Shift hashpipe */
for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) {
hashpipe[i] = hashpipe[i - 1];
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index 4527e699c..7db1af12c 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -78,12 +78,11 @@ f_str_t *
get_next_word (f_str_t *buf, f_str_t *token)
{
size_t remain;
- char *pos;
+ unsigned char *pos;
if (buf == NULL) {
return NULL;
}
-
if (token->begin == NULL) {
token->begin = buf->begin;
}
@@ -95,15 +94,14 @@ get_next_word (f_str_t *buf, f_str_t *token)
if (remain <= 0) {
return NULL;
}
-
pos = token->begin;
/* Skip non graph symbols */
- while (remain > 0 && !g_ascii_isgraph (*pos)) {
+ while (remain > 0 && (!g_ascii_isgraph (*pos) && *pos < 127)) {
token->begin ++;
pos ++;
remain --;
}
- while (remain > 0 && g_ascii_isgraph (*pos)) {
+ while (remain > 0 && (g_ascii_isgraph (*pos) || *pos > 127)) {
token->len ++;
pos ++;
remain --;
diff --git a/src/util.c b/src/util.c
index 76c9c31a8..9dab02da7 100644
--- a/src/util.c
+++ b/src/util.c
@@ -1052,7 +1052,7 @@ maybe_parse_host_list (memory_pool_t *pool, GHashTable *tbl, const char *filenam
gint
rspamd_strcase_equal (gconstpointer v, gconstpointer v2)
{
- return g_ascii_strcasecmp ((const char *) v, (const char *) v2) == 0;
+ return g_ascii_strcasecmp ((const char *) v, (const char *) v2);
}
diff --git a/src/view.c b/src/view.c
index 0bd534b32..0a03d4304 100644
--- a/src/view.c
+++ b/src/view.c
@@ -120,7 +120,7 @@ find_view_by_ip (GList *views, struct worker_task *task)
cur = views;
while (cur) {
v = cur->data;
- if (radix32tree_find (v->ip_tree, task->from_addr.s_addr) != RADIX_NO_VALUE) {
+ if (radix32tree_find (v->ip_tree, ntohl (task->from_addr.s_addr)) != RADIX_NO_VALUE) {
return v;
}
cur = g_list_next (cur);