diff options
Diffstat (limited to 'src/libstat/classifiers')
-rw-r--r-- | src/libstat/classifiers/bayes.c | 341 | ||||
-rw-r--r-- | src/libstat/classifiers/classifiers.h | 88 | ||||
-rw-r--r-- | src/libstat/classifiers/lua_classifier.c | 216 |
3 files changed, 322 insertions, 323 deletions
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c index 6709bb75a..513db9af9 100644 --- a/src/libstat/classifiers/bayes.c +++ b/src/libstat/classifiers/bayes.c @@ -21,25 +21,25 @@ #include "stat_internal.h" #include "math.h" -#define msg_err_bayes(...) rspamd_default_log_function (G_LOG_LEVEL_CRITICAL, \ - "bayes", task->task_pool->tag.uid, \ - RSPAMD_LOG_FUNC, \ - __VA_ARGS__) -#define msg_warn_bayes(...) rspamd_default_log_function (G_LOG_LEVEL_WARNING, \ - "bayes", task->task_pool->tag.uid, \ - RSPAMD_LOG_FUNC, \ - __VA_ARGS__) -#define msg_info_bayes(...) rspamd_default_log_function (G_LOG_LEVEL_INFO, \ - "bayes", task->task_pool->tag.uid, \ - RSPAMD_LOG_FUNC, \ - __VA_ARGS__) +#define msg_err_bayes(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + "bayes", task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_warn_bayes(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \ + "bayes", task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_info_bayes(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \ + "bayes", task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) INIT_LOG_MODULE_PUBLIC(bayes) static inline GQuark -bayes_error_quark (void) +bayes_error_quark(void) { - return g_quark_from_static_string ("bayes-error"); + return g_quark_from_static_string("bayes-error"); } /** @@ -50,21 +50,21 @@ bayes_error_quark (void) * @return */ static gdouble -inv_chi_square (struct rspamd_task *task, gdouble value, gint freedom_deg) +inv_chi_square(struct rspamd_task *task, gdouble value, gint freedom_deg) { double prob, sum, m; gint i; errno = 0; m = -value; - prob = exp (value); + prob = exp(value); if (errno == ERANGE) { /* * e^x where x is large *NEGATIVE* number is OK, so we have a very strong * confidence that inv-chi-square is close to zero */ - msg_debug_bayes ("exp overflow"); + msg_debug_bayes("exp overflow"); if (value < 0) { return 0; @@ -76,7 +76,7 @@ inv_chi_square (struct rspamd_task *task, gdouble value, gint freedom_deg) sum = prob; - msg_debug_bayes ("m: %f, probability: %g", m, prob); + msg_debug_bayes("m: %f, probability: %g", m, prob); /* * m is our confidence in class @@ -85,12 +85,12 @@ inv_chi_square (struct rspamd_task *task, gdouble value, gint freedom_deg) * from 1.0 (no confidence) to 0.0 (full confidence) */ for (i = 1; i < freedom_deg; i++) { - prob *= m / (gdouble)i; + prob *= m / (gdouble) i; sum += prob; - msg_debug_bayes ("i=%d, probability: %g, sum: %g", i, prob, sum); + msg_debug_bayes("i=%d, probability: %g, sum: %g", i, prob, sum); } - return MIN (1.0, sum); + return MIN(1.0, sum); } struct bayes_task_closure { @@ -107,15 +107,15 @@ struct bayes_task_closure { * Mathematically we use pow(complexity, complexity), where complexity is the * window index */ -static const double feature_weight[] = { 0, 3125, 256, 27, 1, 0, 0, 0 }; +static const double feature_weight[] = {0, 3125, 256, 27, 1, 0, 0, 0}; #define PROB_COMBINE(prob, cnt, weight, assumed) (((weight) * (assumed) + (cnt) * (prob)) / ((weight) + (cnt))) /* * In this callback we calculate local probabilities for tokens */ static void -bayes_classify_token (struct rspamd_classifier *ctx, - rspamd_token_t *tok, struct bayes_task_closure *cl) +bayes_classify_token(struct rspamd_classifier *ctx, + rspamd_token_t *tok, struct bayes_task_closure *cl) { guint i; gint id; @@ -136,15 +136,15 @@ bayes_classify_token (struct rspamd_classifier *ctx, #endif if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_META && cl->meta_skip_prob > 0) { - val = rspamd_random_double_fast (); + val = rspamd_random_double_fast(); if (val <= cl->meta_skip_prob) { if (tok->t1 && tok->t2) { - msg_debug_bayes ( - "token(meta) %uL <%*s:%*s> probabilistically skipped", - tok->data, - (int) tok->t1->original.len, tok->t1->original.begin, - (int) tok->t2->original.len, tok->t2->original.begin); + msg_debug_bayes( + "token(meta) %uL <%*s:%*s> probabilistically skipped", + tok->data, + (int) tok->t1->original.len, tok->t1->original.begin, + (int) tok->t2->original.len, tok->t2->original.begin); } return; @@ -152,9 +152,9 @@ bayes_classify_token (struct rspamd_classifier *ctx, } for (i = 0; i < ctx->statfiles_ids->len; i++) { - id = g_array_index (ctx->statfiles_ids, gint, i); - st = g_ptr_array_index (ctx->ctx->statfiles, id); - g_assert (st != NULL); + id = g_array_index(ctx->statfiles_ids, gint, i); + st = g_ptr_array_index(ctx->ctx->statfiles, id); + g_assert(st != NULL); val = tok->values[id]; if (val > 0) { @@ -172,8 +172,8 @@ bayes_classify_token (struct rspamd_classifier *ctx, /* Probability for this token */ if (total_count >= ctx->cfg->min_token_hits) { - spam_freq = ((double)spam_count / MAX (1., (double) ctx->spam_learns)); - ham_freq = ((double)ham_count / MAX (1., (double)ctx->ham_learns)); + spam_freq = ((double) spam_count / MAX(1., (double) ctx->spam_learns)); + ham_freq = ((double) ham_count / MAX(1., (double) ctx->ham_learns)); spam_prob = spam_freq / (spam_freq + ham_freq); ham_prob = ham_freq / (spam_freq + ham_freq); @@ -182,93 +182,91 @@ bayes_classify_token (struct rspamd_classifier *ctx, } else { fw = feature_weight[tok->window_idx % - G_N_ELEMENTS (feature_weight)]; + G_N_ELEMENTS(feature_weight)]; } w = (fw * total_count) / (1.0 + fw * total_count); - bayes_spam_prob = PROB_COMBINE (spam_prob, total_count, w, 0.5); + bayes_spam_prob = PROB_COMBINE(spam_prob, total_count, w, 0.5); if ((bayes_spam_prob > 0.5 && bayes_spam_prob < 0.5 + ctx->cfg->min_prob_strength) || (bayes_spam_prob < 0.5 && bayes_spam_prob > 0.5 - ctx->cfg->min_prob_strength)) { - msg_debug_bayes ( - "token %uL <%*s:%*s> skipped, probability not in range: %f", - tok->data, - (int) tok->t1->stemmed.len, tok->t1->stemmed.begin, - (int) tok->t2->stemmed.len, tok->t2->stemmed.begin, - bayes_spam_prob); + msg_debug_bayes( + "token %uL <%*s:%*s> skipped, probability not in range: %f", + tok->data, + (int) tok->t1->stemmed.len, tok->t1->stemmed.begin, + (int) tok->t2->stemmed.len, tok->t2->stemmed.begin, + bayes_spam_prob); return; } - bayes_ham_prob = PROB_COMBINE (ham_prob, total_count, w, 0.5); + bayes_ham_prob = PROB_COMBINE(ham_prob, total_count, w, 0.5); - cl->spam_prob += log (bayes_spam_prob); - cl->ham_prob += log (bayes_ham_prob); - cl->processed_tokens ++; + cl->spam_prob += log(bayes_spam_prob); + cl->ham_prob += log(bayes_ham_prob); + cl->processed_tokens++; if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_META)) { - cl->text_tokens ++; + cl->text_tokens++; } else { token_type = "meta"; } if (tok->t1 && tok->t2) { - msg_debug_bayes ("token(%s) %uL <%*s:%*s>: weight: %f, cf: %f, " - "total_count: %ud, " - "spam_count: %ud, ham_count: %ud," - "spam_prob: %.3f, ham_prob: %.3f, " - "bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, " - "current spam probability: %.3f, current ham probability: %.3f", - token_type, - tok->data, - (int) tok->t1->stemmed.len, tok->t1->stemmed.begin, - (int) tok->t2->stemmed.len, tok->t2->stemmed.begin, - fw, w, total_count, spam_count, ham_count, - spam_prob, ham_prob, - bayes_spam_prob, bayes_ham_prob, - cl->spam_prob, cl->ham_prob); + msg_debug_bayes("token(%s) %uL <%*s:%*s>: weight: %f, cf: %f, " + "total_count: %ud, " + "spam_count: %ud, ham_count: %ud," + "spam_prob: %.3f, ham_prob: %.3f, " + "bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, " + "current spam probability: %.3f, current ham probability: %.3f", + token_type, + tok->data, + (int) tok->t1->stemmed.len, tok->t1->stemmed.begin, + (int) tok->t2->stemmed.len, tok->t2->stemmed.begin, + fw, w, total_count, spam_count, ham_count, + spam_prob, ham_prob, + bayes_spam_prob, bayes_ham_prob, + cl->spam_prob, cl->ham_prob); } else { - msg_debug_bayes ("token(%s) %uL <?:?>: weight: %f, cf: %f, " - "total_count: %ud, " - "spam_count: %ud, ham_count: %ud," - "spam_prob: %.3f, ham_prob: %.3f, " - "bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, " - "current spam probability: %.3f, current ham probability: %.3f", - token_type, - tok->data, - fw, w, total_count, spam_count, ham_count, - spam_prob, ham_prob, - bayes_spam_prob, bayes_ham_prob, - cl->spam_prob, cl->ham_prob); + msg_debug_bayes("token(%s) %uL <?:?>: weight: %f, cf: %f, " + "total_count: %ud, " + "spam_count: %ud, ham_count: %ud," + "spam_prob: %.3f, ham_prob: %.3f, " + "bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, " + "current spam probability: %.3f, current ham probability: %.3f", + token_type, + tok->data, + fw, w, total_count, spam_count, ham_count, + spam_prob, ham_prob, + bayes_spam_prob, bayes_ham_prob, + cl->spam_prob, cl->ham_prob); } } } - gboolean -bayes_init (struct rspamd_config *cfg, - struct ev_loop *ev_base, - struct rspamd_classifier *cl) +bayes_init(struct rspamd_config *cfg, + struct ev_loop *ev_base, + struct rspamd_classifier *cl) { cl->cfg->flags |= RSPAMD_FLAG_CLASSIFIER_INTEGER; return TRUE; } -void -bayes_fin (struct rspamd_classifier *cl) +void bayes_fin(struct rspamd_classifier *cl) { } gboolean -bayes_classify (struct rspamd_classifier * ctx, - GPtrArray *tokens, - struct rspamd_task *task) +bayes_classify(struct rspamd_classifier *ctx, + GPtrArray *tokens, + struct rspamd_task *task) { double final_prob, h, s, *pprob; gchar sumbuf[32]; @@ -278,41 +276,41 @@ bayes_classify (struct rspamd_classifier * ctx, guint i, text_tokens = 0; gint id; - g_assert (ctx != NULL); - g_assert (tokens != NULL); + g_assert(ctx != NULL); + g_assert(tokens != NULL); - memset (&cl, 0, sizeof (cl)); + memset(&cl, 0, sizeof(cl)); cl.task = task; /* Check min learns */ if (ctx->cfg->min_learns > 0) { if (ctx->ham_learns < ctx->cfg->min_learns) { - msg_info_task ("not classified as ham. The ham class needs more " - "training samples. Currently: %ul; minimum %ud required", - ctx->ham_learns, ctx->cfg->min_learns); + msg_info_task("not classified as ham. The ham class needs more " + "training samples. Currently: %ul; minimum %ud required", + ctx->ham_learns, ctx->cfg->min_learns); return TRUE; } if (ctx->spam_learns < ctx->cfg->min_learns) { - msg_info_task ("not classified as spam. The spam class needs more " - "training samples. Currently: %ul; minimum %ud required", - ctx->spam_learns, ctx->cfg->min_learns); + msg_info_task("not classified as spam. The spam class needs more " + "training samples. Currently: %ul; minimum %ud required", + ctx->spam_learns, ctx->cfg->min_learns); return TRUE; } } - for (i = 0; i < tokens->len; i ++) { - tok = g_ptr_array_index (tokens, i); + for (i = 0; i < tokens->len; i++) { + tok = g_ptr_array_index(tokens, i); if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_META)) { - text_tokens ++; + text_tokens++; } } if (text_tokens == 0) { - msg_info_task ("skipped classification as there are no text tokens. " - "Total tokens: %ud", - tokens->len); + msg_info_task("skipped classification as there are no text tokens. " + "Total tokens: %ud", + tokens->len); return TRUE; } @@ -327,42 +325,42 @@ bayes_classify (struct rspamd_classifier * ctx, cl.meta_skip_prob = 1.0 - text_tokens / tokens->len; } - for (i = 0; i < tokens->len; i ++) { - tok = g_ptr_array_index (tokens, i); + for (i = 0; i < tokens->len; i++) { + tok = g_ptr_array_index(tokens, i); - bayes_classify_token (ctx, tok, &cl); + bayes_classify_token(ctx, tok, &cl); } if (cl.processed_tokens == 0) { - msg_info_bayes ("no tokens found in bayes database " - "(%ud total tokens, %ud text tokens), ignore stats", - tokens->len, text_tokens); + msg_info_bayes("no tokens found in bayes database " + "(%ud total tokens, %ud text tokens), ignore stats", + tokens->len, text_tokens); return TRUE; } if (ctx->cfg->min_tokens > 0 && - cl.text_tokens < (gint)(ctx->cfg->min_tokens * 0.1)) { - msg_info_bayes ("ignore bayes probability since we have " - "found too few text tokens: %uL (of %ud checked), " - "at least %d required", - cl.text_tokens, - text_tokens, - (gint)(ctx->cfg->min_tokens * 0.1)); + cl.text_tokens < (gint) (ctx->cfg->min_tokens * 0.1)) { + msg_info_bayes("ignore bayes probability since we have " + "found too few text tokens: %uL (of %ud checked), " + "at least %d required", + cl.text_tokens, + text_tokens, + (gint) (ctx->cfg->min_tokens * 0.1)); return TRUE; } if (cl.spam_prob > -300 && cl.ham_prob > -300) { /* Fisher value is low enough to apply inv_chi_square */ - h = 1 - inv_chi_square (task, cl.spam_prob, cl.processed_tokens); - s = 1 - inv_chi_square (task, cl.ham_prob, cl.processed_tokens); + h = 1 - inv_chi_square(task, cl.spam_prob, cl.processed_tokens); + s = 1 - inv_chi_square(task, cl.ham_prob, cl.processed_tokens); } else { /* Use naive method */ if (cl.spam_prob < cl.ham_prob) { h = (1.0 - exp(cl.spam_prob - cl.ham_prob)) / - (1.0 + exp(cl.spam_prob - cl.ham_prob)); + (1.0 + exp(cl.spam_prob - cl.ham_prob)); s = 1.0 - h; } else { @@ -372,51 +370,51 @@ bayes_classify (struct rspamd_classifier * ctx, } } - if (isfinite (s) && isfinite (h)) { + if (isfinite(s) && isfinite(h)) { final_prob = (s + 1.0 - h) / 2.; - msg_debug_bayes ( - "got ham probability %.2f -> %.2f and spam probability %.2f -> %.2f," - " %L tokens processed of %ud total tokens;" - " %uL text tokens found of %ud text tokens)", - cl.ham_prob, - h, - cl.spam_prob, - s, - cl.processed_tokens, - tokens->len, - cl.text_tokens, - text_tokens); + msg_debug_bayes( + "got ham probability %.2f -> %.2f and spam probability %.2f -> %.2f," + " %L tokens processed of %ud total tokens;" + " %uL text tokens found of %ud text tokens)", + cl.ham_prob, + h, + cl.spam_prob, + s, + cl.processed_tokens, + tokens->len, + cl.text_tokens, + text_tokens); } else { /* * We have some overflow, hence we need to check which class * is NaN */ - if (isfinite (h)) { + if (isfinite(h)) { final_prob = 1.0; - msg_debug_bayes ("spam class is full: no" - " ham samples"); + msg_debug_bayes("spam class is full: no" + " ham samples"); } - else if (isfinite (s)) { + else if (isfinite(s)) { final_prob = 0.0; - msg_debug_bayes ("ham class is full: no" - " spam samples"); + msg_debug_bayes("ham class is full: no" + " spam samples"); } else { final_prob = 0.5; - msg_warn_bayes ("spam and ham classes are both full"); + msg_warn_bayes("spam and ham classes are both full"); } } - pprob = rspamd_mempool_alloc (task->task_pool, sizeof (*pprob)); + pprob = rspamd_mempool_alloc(task->task_pool, sizeof(*pprob)); *pprob = final_prob; - rspamd_mempool_set_variable (task->task_pool, "bayes_prob", pprob, NULL); + rspamd_mempool_set_variable(task->task_pool, "bayes_prob", pprob, NULL); - if (cl.processed_tokens > 0 && fabs (final_prob - 0.5) > 0.05) { + if (cl.processed_tokens > 0 && fabs(final_prob - 0.5) > 0.05) { /* Now we can have exactly one HAM and exactly one SPAM statfiles per classifier */ for (i = 0; i < ctx->statfiles_ids->len; i++) { - id = g_array_index (ctx->statfiles_ids, gint, i); - st = g_ptr_array_index (ctx->ctx->statfiles, id); + id = g_array_index(ctx->statfiles_ids, gint, i); + st = g_ptr_array_index(ctx->ctx->statfiles, id); if (final_prob > 0.5 && st->stcf->is_spam) { break; @@ -435,14 +433,15 @@ bayes_classify (struct rspamd_classifier * ctx, * Bayes p is from 0.5 to 1.0, but confidence is from 0 to 1, so * we need to rescale it to display correctly */ - rspamd_snprintf (sumbuf, sizeof (sumbuf), "%.2f%%", - (final_prob - 0.5) * 200.); - final_prob = rspamd_normalize_probability (final_prob, 0.5); - g_assert (st != NULL); + rspamd_snprintf(sumbuf, sizeof(sumbuf), "%.2f%%", + (final_prob - 0.5) * 200.); + final_prob = rspamd_normalize_probability(final_prob, 0.5); + g_assert(st != NULL); if (final_prob > 1 || final_prob < 0) { - msg_err_bayes ("internal error: probability %f is outside of the " - "allowed range [0..1]", final_prob); + msg_err_bayes("internal error: probability %f is outside of the " + "allowed range [0..1]", + final_prob); if (final_prob > 1) { final_prob = 1.0; @@ -452,22 +451,22 @@ bayes_classify (struct rspamd_classifier * ctx, } } - rspamd_task_insert_result (task, - st->stcf->symbol, - final_prob, - sumbuf); + rspamd_task_insert_result(task, + st->stcf->symbol, + final_prob, + sumbuf); } return TRUE; } gboolean -bayes_learn_spam (struct rspamd_classifier * ctx, - GPtrArray *tokens, - struct rspamd_task *task, - gboolean is_spam, - gboolean unlearn, - GError **err) +bayes_learn_spam(struct rspamd_classifier *ctx, + GPtrArray *tokens, + struct rspamd_task *task, + gboolean is_spam, + gboolean unlearn, + GError **err) { guint i, j, total_cnt, spam_cnt, ham_cnt; gint id; @@ -475,8 +474,8 @@ bayes_learn_spam (struct rspamd_classifier * ctx, rspamd_token_t *tok; gboolean incrementing; - g_assert (ctx != NULL); - g_assert (tokens != NULL); + g_assert(ctx != NULL); + g_assert(tokens != NULL); incrementing = ctx->cfg->flags & RSPAMD_FLAG_CLASSIFIER_INCREMENTING_BACKEND; @@ -484,12 +483,12 @@ bayes_learn_spam (struct rspamd_classifier * ctx, total_cnt = 0; spam_cnt = 0; ham_cnt = 0; - tok = g_ptr_array_index (tokens, i); + tok = g_ptr_array_index(tokens, i); for (j = 0; j < ctx->statfiles_ids->len; j++) { - id = g_array_index (ctx->statfiles_ids, gint, j); - st = g_ptr_array_index (ctx->ctx->statfiles, id); - g_assert (st != NULL); + id = g_array_index(ctx->statfiles_ids, gint, j); + st = g_ptr_array_index(ctx->ctx->statfiles, id); + g_assert(st != NULL); if (!!st->stcf->is_spam == !!is_spam) { if (incrementing) { @@ -533,18 +532,18 @@ bayes_learn_spam (struct rspamd_classifier * ctx, } if (tok->t1 && tok->t2) { - msg_debug_bayes ("token %uL <%*s:%*s>: window: %d, total_count: %d, " - "spam_count: %d, ham_count: %d", - tok->data, - (int) tok->t1->stemmed.len, tok->t1->stemmed.begin, - (int) tok->t2->stemmed.len, tok->t2->stemmed.begin, - tok->window_idx, total_cnt, spam_cnt, ham_cnt); + msg_debug_bayes("token %uL <%*s:%*s>: window: %d, total_count: %d, " + "spam_count: %d, ham_count: %d", + tok->data, + (int) tok->t1->stemmed.len, tok->t1->stemmed.begin, + (int) tok->t2->stemmed.len, tok->t2->stemmed.begin, + tok->window_idx, total_cnt, spam_cnt, ham_cnt); } else { - msg_debug_bayes ("token %uL <?:?>: window: %d, total_count: %d, " - "spam_count: %d, ham_count: %d", - tok->data, - tok->window_idx, total_cnt, spam_cnt, ham_cnt); + msg_debug_bayes("token %uL <?:?>: window: %d, total_count: %d, " + "spam_count: %d, ham_count: %d", + tok->data, + tok->window_idx, total_cnt, spam_cnt, ham_cnt); } } diff --git a/src/libstat/classifiers/classifiers.h b/src/libstat/classifiers/classifiers.h index 32473cdd1..f6109c3e5 100644 --- a/src/libstat/classifiers/classifiers.h +++ b/src/libstat/classifiers/classifiers.h @@ -9,7 +9,7 @@ /* Consider this value as 0 */ #define ALPHA 0.0001 -#ifdef __cplusplus +#ifdef __cplusplus extern "C" { #endif @@ -23,66 +23,66 @@ struct token_node_s; struct rspamd_stat_classifier { char *name; - gboolean (*init_func) (struct rspamd_config *cfg, - struct ev_loop *ev_base, - struct rspamd_classifier *cl); + gboolean (*init_func)(struct rspamd_config *cfg, + struct ev_loop *ev_base, + struct rspamd_classifier *cl); - gboolean (*classify_func) (struct rspamd_classifier *ctx, - GPtrArray *tokens, - struct rspamd_task *task); + gboolean (*classify_func)(struct rspamd_classifier *ctx, + GPtrArray *tokens, + struct rspamd_task *task); - gboolean (*learn_spam_func) (struct rspamd_classifier *ctx, - GPtrArray *input, - struct rspamd_task *task, - gboolean is_spam, - gboolean unlearn, - GError **err); + gboolean (*learn_spam_func)(struct rspamd_classifier *ctx, + GPtrArray *input, + struct rspamd_task *task, + gboolean is_spam, + gboolean unlearn, + GError **err); - void (*fin_func) (struct rspamd_classifier *cl); + void (*fin_func)(struct rspamd_classifier *cl); }; /* Bayes algorithm */ -gboolean bayes_init (struct rspamd_config *cfg, - struct ev_loop *ev_base, - struct rspamd_classifier *); +gboolean bayes_init(struct rspamd_config *cfg, + struct ev_loop *ev_base, + struct rspamd_classifier *); -gboolean bayes_classify (struct rspamd_classifier *ctx, - GPtrArray *tokens, - struct rspamd_task *task); +gboolean bayes_classify(struct rspamd_classifier *ctx, + GPtrArray *tokens, + struct rspamd_task *task); -gboolean bayes_learn_spam (struct rspamd_classifier *ctx, - GPtrArray *tokens, - struct rspamd_task *task, - gboolean is_spam, - gboolean unlearn, - GError **err); +gboolean bayes_learn_spam(struct rspamd_classifier *ctx, + GPtrArray *tokens, + struct rspamd_task *task, + gboolean is_spam, + gboolean unlearn, + GError **err); -void bayes_fin (struct rspamd_classifier *); +void bayes_fin(struct rspamd_classifier *); /* Generic lua classifier */ -gboolean lua_classifier_init (struct rspamd_config *cfg, - struct ev_loop *ev_base, - struct rspamd_classifier *); +gboolean lua_classifier_init(struct rspamd_config *cfg, + struct ev_loop *ev_base, + struct rspamd_classifier *); -gboolean lua_classifier_classify (struct rspamd_classifier *ctx, - GPtrArray *tokens, - struct rspamd_task *task); +gboolean lua_classifier_classify(struct rspamd_classifier *ctx, + GPtrArray *tokens, + struct rspamd_task *task); -gboolean lua_classifier_learn_spam (struct rspamd_classifier *ctx, - GPtrArray *tokens, - struct rspamd_task *task, - gboolean is_spam, - gboolean unlearn, - GError **err); +gboolean lua_classifier_learn_spam(struct rspamd_classifier *ctx, + GPtrArray *tokens, + struct rspamd_task *task, + gboolean is_spam, + gboolean unlearn, + GError **err); extern gint rspamd_bayes_log_id; -#define msg_debug_bayes(...) rspamd_conditional_debug_fast (NULL, task->from_addr, \ - rspamd_bayes_log_id, "bayes", task->task_pool->tag.uid, \ - G_STRFUNC, \ - __VA_ARGS__) +#define msg_debug_bayes(...) rspamd_conditional_debug_fast(NULL, task->from_addr, \ + rspamd_bayes_log_id, "bayes", task->task_pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) -#ifdef __cplusplus +#ifdef __cplusplus } #endif diff --git a/src/libstat/classifiers/lua_classifier.c b/src/libstat/classifiers/lua_classifier.c index 41657abc0..b74330dca 100644 --- a/src/libstat/classifiers/lua_classifier.c +++ b/src/libstat/classifiers/lua_classifier.c @@ -27,108 +27,108 @@ struct rspamd_lua_classifier_ctx { static GHashTable *lua_classifiers = NULL; -#define msg_err_luacl(...) rspamd_default_log_function (G_LOG_LEVEL_CRITICAL, \ - "luacl", task->task_pool->tag.uid, \ - RSPAMD_LOG_FUNC, \ - __VA_ARGS__) -#define msg_warn_luacl(...) rspamd_default_log_function (G_LOG_LEVEL_WARNING, \ - "luacl", task->task_pool->tag.uid, \ - RSPAMD_LOG_FUNC, \ - __VA_ARGS__) -#define msg_info_luacl(...) rspamd_default_log_function (G_LOG_LEVEL_INFO, \ - "luacl", task->task_pool->tag.uid, \ - RSPAMD_LOG_FUNC, \ - __VA_ARGS__) -#define msg_debug_luacl(...) rspamd_conditional_debug_fast (NULL, task->from_addr, \ - rspamd_luacl_log_id, "luacl", task->task_pool->tag.uid, \ - RSPAMD_LOG_FUNC, \ - __VA_ARGS__) +#define msg_err_luacl(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \ + "luacl", task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_warn_luacl(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \ + "luacl", task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_info_luacl(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \ + "luacl", task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) +#define msg_debug_luacl(...) rspamd_conditional_debug_fast(NULL, task->from_addr, \ + rspamd_luacl_log_id, "luacl", task->task_pool->tag.uid, \ + RSPAMD_LOG_FUNC, \ + __VA_ARGS__) INIT_LOG_MODULE(luacl) gboolean -lua_classifier_init (struct rspamd_config *cfg, - struct ev_loop *ev_base, - struct rspamd_classifier *cl) +lua_classifier_init(struct rspamd_config *cfg, + struct ev_loop *ev_base, + struct rspamd_classifier *cl) { struct rspamd_lua_classifier_ctx *ctx; lua_State *L = cl->ctx->cfg->lua_state; gint cb_classify = -1, cb_learn = -1; if (lua_classifiers == NULL) { - lua_classifiers = g_hash_table_new_full (rspamd_strcase_hash, - rspamd_strcase_equal, g_free, g_free); + lua_classifiers = g_hash_table_new_full(rspamd_strcase_hash, + rspamd_strcase_equal, g_free, g_free); } - ctx = g_hash_table_lookup (lua_classifiers, cl->subrs->name); + ctx = g_hash_table_lookup(lua_classifiers, cl->subrs->name); if (ctx != NULL) { - msg_err_config ("duplicate lua classifier definition: %s", - cl->subrs->name); + msg_err_config("duplicate lua classifier definition: %s", + cl->subrs->name); return FALSE; } - lua_getglobal (L, "rspamd_classifiers"); - if (lua_type (L, -1) != LUA_TTABLE) { - msg_err_config ("cannot register classifier %s: no rspamd_classifier global", - cl->subrs->name); - lua_pop (L, 1); + lua_getglobal(L, "rspamd_classifiers"); + if (lua_type(L, -1) != LUA_TTABLE) { + msg_err_config("cannot register classifier %s: no rspamd_classifier global", + cl->subrs->name); + lua_pop(L, 1); return FALSE; } - lua_pushstring (L, cl->subrs->name); - lua_gettable (L, -2); + lua_pushstring(L, cl->subrs->name); + lua_gettable(L, -2); - if (lua_type (L, -1) != LUA_TTABLE) { - msg_err_config ("cannot register classifier %s: bad lua type: %s", - cl->subrs->name, lua_typename (L, lua_type (L, -1))); - lua_pop (L, 2); + if (lua_type(L, -1) != LUA_TTABLE) { + msg_err_config("cannot register classifier %s: bad lua type: %s", + cl->subrs->name, lua_typename(L, lua_type(L, -1))); + lua_pop(L, 2); return FALSE; } - lua_pushstring (L, "classify"); - lua_gettable (L, -2); + lua_pushstring(L, "classify"); + lua_gettable(L, -2); - if (lua_type (L, -1) != LUA_TFUNCTION) { - msg_err_config ("cannot register classifier %s: bad lua type for classify: %s", - cl->subrs->name, lua_typename (L, lua_type (L, -1))); - lua_pop (L, 3); + if (lua_type(L, -1) != LUA_TFUNCTION) { + msg_err_config("cannot register classifier %s: bad lua type for classify: %s", + cl->subrs->name, lua_typename(L, lua_type(L, -1))); + lua_pop(L, 3); return FALSE; } - cb_classify = luaL_ref (L, LUA_REGISTRYINDEX); + cb_classify = luaL_ref(L, LUA_REGISTRYINDEX); - lua_pushstring (L, "learn"); - lua_gettable (L, -2); + lua_pushstring(L, "learn"); + lua_gettable(L, -2); - if (lua_type (L, -1) != LUA_TFUNCTION) { - msg_err_config ("cannot register classifier %s: bad lua type for learn: %s", - cl->subrs->name, lua_typename (L, lua_type (L, -1))); - lua_pop (L, 3); + if (lua_type(L, -1) != LUA_TFUNCTION) { + msg_err_config("cannot register classifier %s: bad lua type for learn: %s", + cl->subrs->name, lua_typename(L, lua_type(L, -1))); + lua_pop(L, 3); return FALSE; } - cb_learn = luaL_ref (L, LUA_REGISTRYINDEX); - lua_pop (L, 2); /* Table + global */ + cb_learn = luaL_ref(L, LUA_REGISTRYINDEX); + lua_pop(L, 2); /* Table + global */ - ctx = g_malloc0 (sizeof (*ctx)); - ctx->name = g_strdup (cl->subrs->name); + ctx = g_malloc0(sizeof(*ctx)); + ctx->name = g_strdup(cl->subrs->name); ctx->classify_ref = cb_classify; ctx->learn_ref = cb_learn; cl->cfg->flags |= RSPAMD_FLAG_CLASSIFIER_NO_BACKEND; - g_hash_table_insert (lua_classifiers, ctx->name, ctx); + g_hash_table_insert(lua_classifiers, ctx->name, ctx); return TRUE; } gboolean -lua_classifier_classify (struct rspamd_classifier *cl, - GPtrArray *tokens, - struct rspamd_task *task) +lua_classifier_classify(struct rspamd_classifier *cl, + GPtrArray *tokens, + struct rspamd_task *task) { struct rspamd_lua_classifier_ctx *ctx; struct rspamd_task **ptask; @@ -138,38 +138,38 @@ lua_classifier_classify (struct rspamd_classifier *cl, guint i; guint64 v; - ctx = g_hash_table_lookup (lua_classifiers, cl->subrs->name); - g_assert (ctx != NULL); + ctx = g_hash_table_lookup(lua_classifiers, cl->subrs->name); + g_assert(ctx != NULL); L = task->cfg->lua_state; - lua_rawgeti (L, LUA_REGISTRYINDEX, ctx->classify_ref); - ptask = lua_newuserdata (L, sizeof (*ptask)); + lua_rawgeti(L, LUA_REGISTRYINDEX, ctx->classify_ref); + ptask = lua_newuserdata(L, sizeof(*ptask)); *ptask = task; - rspamd_lua_setclass (L, "rspamd{task}", -1); - pcfg = lua_newuserdata (L, sizeof (*pcfg)); + rspamd_lua_setclass(L, "rspamd{task}", -1); + pcfg = lua_newuserdata(L, sizeof(*pcfg)); *pcfg = cl->cfg; - rspamd_lua_setclass (L, "rspamd{classifier}", -1); + rspamd_lua_setclass(L, "rspamd{classifier}", -1); - lua_createtable (L, tokens->len, 0); + lua_createtable(L, tokens->len, 0); - for (i = 0; i < tokens->len; i ++) { - tok = g_ptr_array_index (tokens, i); + for (i = 0; i < tokens->len; i++) { + tok = g_ptr_array_index(tokens, i); v = tok->data; - lua_createtable (L, 3, 0); + lua_createtable(L, 3, 0); /* High word, low word, order */ - lua_pushinteger (L, (guint32)(v >> 32)); - lua_rawseti (L, -2, 1); - lua_pushinteger (L, (guint32)(v)); - lua_rawseti (L, -2, 2); - lua_pushinteger (L, tok->window_idx); - lua_rawseti (L, -2, 3); - lua_rawseti (L, -2, i + 1); + lua_pushinteger(L, (guint32) (v >> 32)); + lua_rawseti(L, -2, 1); + lua_pushinteger(L, (guint32) (v)); + lua_rawseti(L, -2, 2); + lua_pushinteger(L, tok->window_idx); + lua_rawseti(L, -2, 3); + lua_rawseti(L, -2, i + 1); } - if (lua_pcall (L, 3, 0, 0) != 0) { - msg_err_luacl ("error running classify function for %s: %s", ctx->name, - lua_tostring (L, -1)); - lua_pop (L, 1); + if (lua_pcall(L, 3, 0, 0) != 0) { + msg_err_luacl("error running classify function for %s: %s", ctx->name, + lua_tostring(L, -1)); + lua_pop(L, 1); return FALSE; } @@ -178,12 +178,12 @@ lua_classifier_classify (struct rspamd_classifier *cl, } gboolean -lua_classifier_learn_spam (struct rspamd_classifier *cl, - GPtrArray *tokens, - struct rspamd_task *task, - gboolean is_spam, - gboolean unlearn, - GError **err) +lua_classifier_learn_spam(struct rspamd_classifier *cl, + GPtrArray *tokens, + struct rspamd_task *task, + gboolean is_spam, + gboolean unlearn, + GError **err) { struct rspamd_lua_classifier_ctx *ctx; struct rspamd_task **ptask; @@ -193,42 +193,42 @@ lua_classifier_learn_spam (struct rspamd_classifier *cl, guint i; guint64 v; - ctx = g_hash_table_lookup (lua_classifiers, cl->subrs->name); - g_assert (ctx != NULL); + ctx = g_hash_table_lookup(lua_classifiers, cl->subrs->name); + g_assert(ctx != NULL); L = task->cfg->lua_state; - lua_rawgeti (L, LUA_REGISTRYINDEX, ctx->learn_ref); - ptask = lua_newuserdata (L, sizeof (*ptask)); + lua_rawgeti(L, LUA_REGISTRYINDEX, ctx->learn_ref); + ptask = lua_newuserdata(L, sizeof(*ptask)); *ptask = task; - rspamd_lua_setclass (L, "rspamd{task}", -1); - pcfg = lua_newuserdata (L, sizeof (*pcfg)); + rspamd_lua_setclass(L, "rspamd{task}", -1); + pcfg = lua_newuserdata(L, sizeof(*pcfg)); *pcfg = cl->cfg; - rspamd_lua_setclass (L, "rspamd{classifier}", -1); + rspamd_lua_setclass(L, "rspamd{classifier}", -1); - lua_createtable (L, tokens->len, 0); + lua_createtable(L, tokens->len, 0); - for (i = 0; i < tokens->len; i ++) { - tok = g_ptr_array_index (tokens, i); + for (i = 0; i < tokens->len; i++) { + tok = g_ptr_array_index(tokens, i); v = 0; v = tok->data; - lua_createtable (L, 3, 0); + lua_createtable(L, 3, 0); /* High word, low word, order */ - lua_pushinteger (L, (guint32)(v >> 32)); - lua_rawseti (L, -2, 1); - lua_pushinteger (L, (guint32)(v)); - lua_rawseti (L, -2, 2); - lua_pushinteger (L, tok->window_idx); - lua_rawseti (L, -2, 3); - lua_rawseti (L, -2, i + 1); + lua_pushinteger(L, (guint32) (v >> 32)); + lua_rawseti(L, -2, 1); + lua_pushinteger(L, (guint32) (v)); + lua_rawseti(L, -2, 2); + lua_pushinteger(L, tok->window_idx); + lua_rawseti(L, -2, 3); + lua_rawseti(L, -2, i + 1); } - lua_pushboolean (L, is_spam); - lua_pushboolean (L, unlearn); + lua_pushboolean(L, is_spam); + lua_pushboolean(L, unlearn); - if (lua_pcall (L, 5, 0, 0) != 0) { - msg_err_luacl ("error running learn function for %s: %s", ctx->name, - lua_tostring (L, -1)); - lua_pop (L, 1); + if (lua_pcall(L, 5, 0, 0) != 0) { + msg_err_luacl("error running learn function for %s: %s", ctx->name, + lua_tostring(L, -1)); + lua_pop(L, 1); return FALSE; } |