From 2234daebbb352b444b322d43cc6c1093f0ce949c Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Thu, 9 Jul 2009 20:45:11 +0400 Subject: [PATCH] * Make autolearn working --- src/filter.c | 105 ++++++++++++++++++++++++++++++++++-- src/filter.h | 1 + src/fstring.c | 5 +- src/statfile.c | 10 ++-- src/tokenizers/osb.c | 10 +++- src/tokenizers/tokenizers.c | 8 ++- src/util.c | 2 +- src/view.c | 2 +- 8 files changed, 122 insertions(+), 21 deletions(-) diff --git a/src/filter.c b/src/filter.c index daa9b0e29..1c45f0886 100644 --- a/src/filter.c +++ b/src/filter.c @@ -62,6 +62,7 @@ insert_result (struct worker_task *task, const char *metric_name, const char *sy /* Create new metric chain */ metric_res = memory_pool_alloc (task->task_pool, sizeof (struct metric_result)); metric_res->symbols = g_hash_table_new (g_str_hash, g_str_equal); + metric_res->checked = FALSE; memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_hash_table_destroy, metric_res->symbols); metric_res->metric = metric; g_hash_table_insert (task->results, (gpointer)metric_name, metric_res); @@ -214,11 +215,19 @@ call_filter_by_name (struct worker_task *task, const char *name, enum filter_typ } static void -metric_process_callback (gpointer key, gpointer value, void *data) +metric_process_callback_common (gpointer key, gpointer value, void *data, gboolean is_forced) { struct worker_task *task = (struct worker_task *)data; struct metric_result *metric_res = (struct metric_result *)value; + if (metric_res->checked && !is_forced) { + /* Already checked */ + return; + } + + /* Set flag */ + metric_res->checked = TRUE; + if (metric_res->metric->func != NULL) { metric_res->score = metric_res->metric->func (task, metric_res->metric->name, metric_res->metric->func_name); } @@ -229,6 +238,18 @@ metric_process_callback (gpointer key, gpointer value, void *data) metric_res->score, metric_res->metric->name); } +static void +metric_process_callback_normal (gpointer key, gpointer value, void *data) +{ + metric_process_callback_common (key, value, data, FALSE); +} + +static void +metric_process_callback_forced (gpointer key, gpointer value, void *data) +{ + metric_process_callback_common (key, value, data, TRUE); +} + static int continue_process_filters (struct worker_task *task) { @@ -359,7 +380,7 @@ process_filters (struct worker_task *task) } /* Process all metrics */ - g_hash_table_foreach (task->results, metric_process_callback, task); + g_hash_table_foreach (task->results, metric_process_callback_forced, task); return 1; } @@ -443,6 +464,75 @@ composites_foreach_callback (gpointer key, gpointer value, void *data) return; } +static gboolean +check_autolearn (struct statfile_autolearn_params *params, struct worker_task *task) +{ + const char *metric_name = DEFAULT_METRIC; + struct metric_result *metric_res; + GList *cur; + + if (params->metric != NULL) { + metric_name = params->metric; + } + + /* First check threshold */ + metric_res = g_hash_table_lookup (task->results, metric_name); + if (metric_res == NULL) { + if (params->symbols == NULL && params->threshold_max > 0) { + /* For ham messages */ + return TRUE; + } + msg_debug ("check_autolearn: metric %s has no results", metric_name); + return FALSE; + } + else { + /* Process score of metric */ + metric_process_callback_normal ((void *)metric_name, metric_res, task); + if ((params->threshold_min != 0 && metric_res->score > params->threshold_min) || + (params->threshold_max != 0 && metric_res->score < params->threshold_max)) { + /* Now check for specific symbols */ + if (params->symbols) { + cur = params->symbols; + while (cur) { + if (g_hash_table_lookup (metric_res->symbols, cur->data) == NULL) { + return FALSE; + } + cur = g_list_next (cur); + } + } + /* Now allow processing of actual autolearn */ + return TRUE; + } + } + + return FALSE; +} + +static void +process_autolearn (struct statfile *st, struct worker_task *task, GTree *tokens, + struct classifier *classifier, char *filename, struct classifier_ctx* ctx) +{ + if (check_autolearn (st->autolearn, task)) { + if (tokens) { + msg_info ("process_autolearn: message with id <%s> autolearned statfile '%s'", task->message_id, filename); + /* Check opened */ + if (! statfile_pool_is_open (task->worker->srv->statfile_pool, filename)) { + /* Try open */ + if (statfile_pool_open (task->worker->srv->statfile_pool, filename) == NULL) { + /* Try create */ + if (statfile_pool_create (task->worker->srv->statfile_pool, + filename, st->size / sizeof (struct stat_file_block)) == -1) { + msg_info ("process_autolearn: error while creating statfile %s", filename); + return; + } + } + } + + classifier->learn_func (ctx, task->worker->srv->statfile_pool, filename, tokens, 1); + } + } +} + static void composites_metric_callback (gpointer key, gpointer value, void *data) { @@ -498,7 +588,7 @@ statfiles_callback (gpointer key, gpointer value, void *arg) filename = resolve_stat_filename (task->task_pool, st->pattern, task->from, ""); } - if (statfile_pool_open (task->worker->srv->statfile_pool, filename) == NULL) { + if (statfile_pool_open (task->worker->srv->statfile_pool, filename) == NULL && !check_autolearn (st->autolearn, task)) { return; } @@ -513,6 +603,7 @@ statfiles_callback (gpointer key, gpointer value, void *arg) msg_info ("statfiles_callback: cannot tokenize input"); return; } + cur = g_list_next (cur); } g_hash_table_insert (data->tokens, st->tokenizer, tokens); } @@ -533,6 +624,10 @@ statfiles_callback (gpointer key, gpointer value, void *arg) classifier->classify_func (res_data->ctx, task->worker->srv->statfile_pool, filename, tokens, st->weight); + if (st->autolearn) { + /* Process autolearn */ + process_autolearn (st, task, tokens, classifier, filename, res_data->ctx); + } } static void @@ -548,7 +643,6 @@ statfiles_results_callback (gpointer key, gpointer value, void *arg) filename = classifier->result_file_func (res->ctx, w); insert_result (task, res->metric->name, classifier->name, *w, NULL); msg_debug ("statfiles_results_callback: got total weight %.2f for metric %s", *w, res->metric->name); - } @@ -566,7 +660,8 @@ process_statfiles (struct worker_task *task) g_hash_table_destroy (cd.tokens); g_hash_table_destroy (cd.classifiers); - g_hash_table_foreach (task->results, metric_process_callback, task); + /* Process results */ + g_hash_table_foreach (task->results, metric_process_callback_forced, task); task->state = WRITE_REPLY; } diff --git a/src/filter.h b/src/filter.h index c460ec317..e0c989f85 100644 --- a/src/filter.h +++ b/src/filter.h @@ -49,6 +49,7 @@ struct metric_result { struct metric *metric; /**< pointer to metric structure */ double score; /**< total score */ GHashTable *symbols; /**< symbols of metric */ + gboolean checked; /**< whether metric result is consolidated */ }; /** diff --git a/src/fstring.c b/src/fstring.c index 935c8bdcc..00ca4ed12 100644 --- a/src/fstring.c +++ b/src/fstring.c @@ -306,19 +306,20 @@ fstrhash (f_str_t *str) size_t i; uint32_t hval; uint32_t tmp; + char *c = str->begin; if (str == NULL) { return 0; } hval = str->len; - for (i = 0; i < str->len; i++) { + for (i = 0; i < str->len; i++, c++) { /* * xor in the current byte against each byte of hval * (which alone gaurantees that every bit of input will have * an effect on the output) */ - tmp = *(str->begin + i) & 0xFF; + tmp = *c & 0xFF; tmp = tmp | (tmp << 8) | (tmp << 16) | (tmp << 24); hval ^= tmp; diff --git a/src/statfile.c b/src/statfile.c index ac0c3bfaa..4a52008ed 100644 --- a/src/statfile.c +++ b/src/statfile.c @@ -129,9 +129,8 @@ statfile_pool_open (statfile_pool_t *pool, char *filename) struct stat st; stat_file_t *new_file; - if (statfile_pool_is_open (pool, filename) != NULL) { - msg_info ("statfile_pool_open: file %s is already opened", filename); - return NULL; + if ((new_file = statfile_pool_is_open (pool, filename)) != NULL) { + return new_file; } if (pool->opened >= STATFILES_MAX - 1) { @@ -400,9 +399,10 @@ statfile_pool_set_block (statfile_pool_t *pool, stat_file_t *file, uint32_t h1, stat_file_t * statfile_pool_is_open (statfile_pool_t *pool, char *filename) { - static stat_file_t f; + static stat_file_t f, *ret; f.filename = filename; - return bsearch (&f, pool->files, pool->opened, sizeof (stat_file_t), cmpstatfile); + ret = bsearch (&f, pool->files, pool->opened, sizeof (stat_file_t), cmpstatfile); + return ret; } uint32_t diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c index 32d6b902a..d2a1fe22f 100644 --- a/src/tokenizers/osb.c +++ b/src/tokenizers/osb.c @@ -29,6 +29,8 @@ #include #include "tokenizers.h" +/* Minimum length of token */ +#define MIN_LEN 4 extern const int primes[]; @@ -36,7 +38,7 @@ int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **tree) { token_node_t *new = NULL; - f_str_t token = { NULL, 0, 0 }; + f_str_t token = { NULL, 0, 0 }, *res; uint32_t hashpipe[FEATURE_WINDOW_SIZE], h1, h2; int i; @@ -52,7 +54,11 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *in msg_debug ("osb_tokenize_text: got input length: %zd", input->len); - while (tokenizer->get_next_word (input, &token)) { + while ((res = tokenizer->get_next_word (input, &token)) != NULL) { + /* Skip small words */ + if (token.len < MIN_LEN) { + continue; + } /* Shift hashpipe */ for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i --) { hashpipe[i] = hashpipe[i - 1]; diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c index 4527e699c..7db1af12c 100644 --- a/src/tokenizers/tokenizers.c +++ b/src/tokenizers/tokenizers.c @@ -78,12 +78,11 @@ f_str_t * get_next_word (f_str_t *buf, f_str_t *token) { size_t remain; - char *pos; + unsigned char *pos; if (buf == NULL) { return NULL; } - if (token->begin == NULL) { token->begin = buf->begin; } @@ -95,15 +94,14 @@ get_next_word (f_str_t *buf, f_str_t *token) if (remain <= 0) { return NULL; } - pos = token->begin; /* Skip non graph symbols */ - while (remain > 0 && !g_ascii_isgraph (*pos)) { + while (remain > 0 && (!g_ascii_isgraph (*pos) && *pos < 127)) { token->begin ++; pos ++; remain --; } - while (remain > 0 && g_ascii_isgraph (*pos)) { + while (remain > 0 && (g_ascii_isgraph (*pos) || *pos > 127)) { token->len ++; pos ++; remain --; diff --git a/src/util.c b/src/util.c index 76c9c31a8..9dab02da7 100644 --- a/src/util.c +++ b/src/util.c @@ -1052,7 +1052,7 @@ maybe_parse_host_list (memory_pool_t *pool, GHashTable *tbl, const char *filenam gint rspamd_strcase_equal (gconstpointer v, gconstpointer v2) { - return g_ascii_strcasecmp ((const char *) v, (const char *) v2) == 0; + return g_ascii_strcasecmp ((const char *) v, (const char *) v2); } diff --git a/src/view.c b/src/view.c index 0bd534b32..0a03d4304 100644 --- a/src/view.c +++ b/src/view.c @@ -120,7 +120,7 @@ find_view_by_ip (GList *views, struct worker_task *task) cur = views; while (cur) { v = cur->data; - if (radix32tree_find (v->ip_tree, task->from_addr.s_addr) != RADIX_NO_VALUE) { + if (radix32tree_find (v->ip_tree, ntohl (task->from_addr.s_addr)) != RADIX_NO_VALUE) { return v; } cur = g_list_next (cur); -- 2.39.5