about | summary | refs | log | tree | commit | diff | stats
path: root/src/libstat/classifiers
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@rspamd.com> 2023-07-26 10:49:23 +0100
committer: Vsevolod Stakhov <vsevolod@rspamd.com> 2023-07-26 10:49:23 +0100
commit: 537a7180a0d5132c11636c4fd8b1450cd99d352c (patch)
tree: fb9f8c84955a411bdffbd6371ea32f2716fb3687 /src/libstat/classifiers
parent: 5fd7a90fdaa33f52c59bdb0ca84451e5c1e22365 (diff)
download: rspamd-537a7180a0d5132c11636c4fd8b1450cd99d352c.tar.gz
rspamd-537a7180a0d5132c11636c4fd8b1450cd99d352c.zip
[Rework] Use clang-format to unify formatting in all sources
No meaningful changes.
Diffstat (limited to 'src/libstat/classifiers')
-rw-r--r-- src/libstat/classifiers/bayes.c 341
-rw-r--r-- src/libstat/classifiers/classifiers.h 88
-rw-r--r-- src/libstat/classifiers/lua_classifier.c 216
3 files changed, 322 insertions, 323 deletions
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c
index 6709bb75a..513db9af9 100644
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -21,25 +21,25 @@
#include "stat_internal.h"
#include "math.h"
-#define msg_err_bayes(...) rspamd_default_log_function (G_LOG_LEVEL_CRITICAL, \
- "bayes", task->task_pool->tag.uid, \
- RSPAMD_LOG_FUNC, \
- __VA_ARGS__)
-#define msg_warn_bayes(...) rspamd_default_log_function (G_LOG_LEVEL_WARNING, \
- "bayes", task->task_pool->tag.uid, \
- RSPAMD_LOG_FUNC, \
- __VA_ARGS__)
-#define msg_info_bayes(...) rspamd_default_log_function (G_LOG_LEVEL_INFO, \
- "bayes", task->task_pool->tag.uid, \
- RSPAMD_LOG_FUNC, \
- __VA_ARGS__)
+#define msg_err_bayes(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \
+ "bayes", task->task_pool->tag.uid, \
+ RSPAMD_LOG_FUNC, \
+ __VA_ARGS__)
+#define msg_warn_bayes(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \
+ "bayes", task->task_pool->tag.uid, \
+ RSPAMD_LOG_FUNC, \
+ __VA_ARGS__)
+#define msg_info_bayes(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \
+ "bayes", task->task_pool->tag.uid, \
+ RSPAMD_LOG_FUNC, \
+ __VA_ARGS__)
INIT_LOG_MODULE_PUBLIC(bayes)
static inline GQuark
-bayes_error_quark (void)
+bayes_error_quark(void)
{
- return g_quark_from_static_string ("bayes-error");
+ return g_quark_from_static_string("bayes-error");
}
/**
@@ -50,21 +50,21 @@ bayes_error_quark (void)
* @return
*/
static gdouble
-inv_chi_square (struct rspamd_task *task, gdouble value, gint freedom_deg)
+inv_chi_square(struct rspamd_task *task, gdouble value, gint freedom_deg)
{
double prob, sum, m;
gint i;
errno = 0;
m = -value;
- prob = exp (value);
+ prob = exp(value);
if (errno == ERANGE) {
/*
* e^x where x is large *NEGATIVE* number is OK, so we have a very strong
* confidence that inv-chi-square is close to zero
*/
- msg_debug_bayes ("exp overflow");
+ msg_debug_bayes("exp overflow");
if (value < 0) {
return 0;
@@ -76,7 +76,7 @@ inv_chi_square (struct rspamd_task *task, gdouble value, gint freedom_deg)
sum = prob;
- msg_debug_bayes ("m: %f, probability: %g", m, prob);
+ msg_debug_bayes("m: %f, probability: %g", m, prob);
/*
* m is our confidence in class
@@ -85,12 +85,12 @@ inv_chi_square (struct rspamd_task *task, gdouble value, gint freedom_deg)
* from 1.0 (no confidence) to 0.0 (full confidence)
*/
for (i = 1; i < freedom_deg; i++) {
- prob *= m / (gdouble)i;
+ prob *= m / (gdouble) i;
sum += prob;
- msg_debug_bayes ("i=%d, probability: %g, sum: %g", i, prob, sum);
+ msg_debug_bayes("i=%d, probability: %g, sum: %g", i, prob, sum);
}
- return MIN (1.0, sum);
+ return MIN(1.0, sum);
}
struct bayes_task_closure {
@@ -107,15 +107,15 @@ struct bayes_task_closure {
* Mathematically we use pow(complexity, complexity), where complexity is the
* window index
*/
-static const double feature_weight[] = { 0, 3125, 256, 27, 1, 0, 0, 0 };
+static const double feature_weight[] = {0, 3125, 256, 27, 1, 0, 0, 0};
#define PROB_COMBINE(prob, cnt, weight, assumed) (((weight) * (assumed) + (cnt) * (prob)) / ((weight) + (cnt)))
/*
* In this callback we calculate local probabilities for tokens
*/
static void
-bayes_classify_token (struct rspamd_classifier *ctx,
- rspamd_token_t *tok, struct bayes_task_closure *cl)
+bayes_classify_token(struct rspamd_classifier *ctx,
+ rspamd_token_t *tok, struct bayes_task_closure *cl)
{
guint i;
gint id;
@@ -136,15 +136,15 @@ bayes_classify_token (struct rspamd_classifier *ctx,
#endif
if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_META && cl->meta_skip_prob > 0) {
- val = rspamd_random_double_fast ();
+ val = rspamd_random_double_fast();
if (val <= cl->meta_skip_prob) {
if (tok->t1 && tok->t2) {
- msg_debug_bayes (
- "token(meta) %uL <%*s:%*s> probabilistically skipped",
- tok->data,
- (int) tok->t1->original.len, tok->t1->original.begin,
- (int) tok->t2->original.len, tok->t2->original.begin);
+ msg_debug_bayes(
+ "token(meta) %uL <%*s:%*s> probabilistically skipped",
+ tok->data,
+ (int) tok->t1->original.len, tok->t1->original.begin,
+ (int) tok->t2->original.len, tok->t2->original.begin);
}
return;
@@ -152,9 +152,9 @@ bayes_classify_token (struct rspamd_classifier *ctx,
}
for (i = 0; i < ctx->statfiles_ids->len; i++) {
- id = g_array_index (ctx->statfiles_ids, gint, i);
- st = g_ptr_array_index (ctx->ctx->statfiles, id);
- g_assert (st != NULL);
+ id = g_array_index(ctx->statfiles_ids, gint, i);
+ st = g_ptr_array_index(ctx->ctx->statfiles, id);
+ g_assert(st != NULL);
val = tok->values[id];
if (val > 0) {
@@ -172,8 +172,8 @@ bayes_classify_token (struct rspamd_classifier *ctx,
/* Probability for this token */
if (total_count >= ctx->cfg->min_token_hits) {
- spam_freq = ((double)spam_count / MAX (1., (double) ctx->spam_learns));
- ham_freq = ((double)ham_count / MAX (1., (double)ctx->ham_learns));
+ spam_freq = ((double) spam_count / MAX(1., (double) ctx->spam_learns));
+ ham_freq = ((double) ham_count / MAX(1., (double) ctx->ham_learns));
spam_prob = spam_freq / (spam_freq + ham_freq);
ham_prob = ham_freq / (spam_freq + ham_freq);
@@ -182,93 +182,91 @@ bayes_classify_token (struct rspamd_classifier *ctx,
}
else {
fw = feature_weight[tok->window_idx %
- G_N_ELEMENTS (feature_weight)];
+ G_N_ELEMENTS(feature_weight)];
}
w = (fw * total_count) / (1.0 + fw * total_count);
- bayes_spam_prob = PROB_COMBINE (spam_prob, total_count, w, 0.5);
+ bayes_spam_prob = PROB_COMBINE(spam_prob, total_count, w, 0.5);
if ((bayes_spam_prob > 0.5 && bayes_spam_prob < 0.5 + ctx->cfg->min_prob_strength) ||
(bayes_spam_prob < 0.5 && bayes_spam_prob > 0.5 - ctx->cfg->min_prob_strength)) {
- msg_debug_bayes (
- "token %uL <%*s:%*s> skipped, probability not in range: %f",
- tok->data,
- (int) tok->t1->stemmed.len, tok->t1->stemmed.begin,
- (int) tok->t2->stemmed.len, tok->t2->stemmed.begin,
- bayes_spam_prob);
+ msg_debug_bayes(
+ "token %uL <%*s:%*s> skipped, probability not in range: %f",
+ tok->data,
+ (int) tok->t1->stemmed.len, tok->t1->stemmed.begin,
+ (int) tok->t2->stemmed.len, tok->t2->stemmed.begin,
+ bayes_spam_prob);
return;
}
- bayes_ham_prob = PROB_COMBINE (ham_prob, total_count, w, 0.5);
+ bayes_ham_prob = PROB_COMBINE(ham_prob, total_count, w, 0.5);
- cl->spam_prob += log (bayes_spam_prob);
- cl->ham_prob += log (bayes_ham_prob);
- cl->processed_tokens ++;
+ cl->spam_prob += log(bayes_spam_prob);
+ cl->ham_prob += log(bayes_ham_prob);
+ cl->processed_tokens++;
if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_META)) {
- cl->text_tokens ++;
+ cl->text_tokens++;
}
else {
token_type = "meta";
}
if (tok->t1 && tok->t2) {
- msg_debug_bayes ("token(%s) %uL <%*s:%*s>: weight: %f, cf: %f, "
- "total_count: %ud, "
- "spam_count: %ud, ham_count: %ud,"
- "spam_prob: %.3f, ham_prob: %.3f, "
- "bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, "
- "current spam probability: %.3f, current ham probability: %.3f",
- token_type,
- tok->data,
- (int) tok->t1->stemmed.len, tok->t1->stemmed.begin,
- (int) tok->t2->stemmed.len, tok->t2->stemmed.begin,
- fw, w, total_count, spam_count, ham_count,
- spam_prob, ham_prob,
- bayes_spam_prob, bayes_ham_prob,
- cl->spam_prob, cl->ham_prob);
+ msg_debug_bayes("token(%s) %uL <%*s:%*s>: weight: %f, cf: %f, "
+ "total_count: %ud, "
+ "spam_count: %ud, ham_count: %ud,"
+ "spam_prob: %.3f, ham_prob: %.3f, "
+ "bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, "
+ "current spam probability: %.3f, current ham probability: %.3f",
+ token_type,
+ tok->data,
+ (int) tok->t1->stemmed.len, tok->t1->stemmed.begin,
+ (int) tok->t2->stemmed.len, tok->t2->stemmed.begin,
+ fw, w, total_count, spam_count, ham_count,
+ spam_prob, ham_prob,
+ bayes_spam_prob, bayes_ham_prob,
+ cl->spam_prob, cl->ham_prob);
}
else {
- msg_debug_bayes ("token(%s) %uL <?:?>: weight: %f, cf: %f, "
- "total_count: %ud, "
- "spam_count: %ud, ham_count: %ud,"
- "spam_prob: %.3f, ham_prob: %.3f, "
- "bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, "
- "current spam probability: %.3f, current ham probability: %.3f",
- token_type,
- tok->data,
- fw, w, total_count, spam_count, ham_count,
- spam_prob, ham_prob,
- bayes_spam_prob, bayes_ham_prob,
- cl->spam_prob, cl->ham_prob);
+ msg_debug_bayes("token(%s) %uL <?:?>: weight: %f, cf: %f, "
+ "total_count: %ud, "
+ "spam_count: %ud, ham_count: %ud,"
+ "spam_prob: %.3f, ham_prob: %.3f, "
+ "bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, "
+ "current spam probability: %.3f, current ham probability: %.3f",
+ token_type,
+ tok->data,
+ fw, w, total_count, spam_count, ham_count,
+ spam_prob, ham_prob,
+ bayes_spam_prob, bayes_ham_prob,
+ cl->spam_prob, cl->ham_prob);
}
}
}
-
gboolean
-bayes_init (struct rspamd_config *cfg,
- struct ev_loop *ev_base,
- struct rspamd_classifier *cl)
+bayes_init(struct rspamd_config *cfg,
+ struct ev_loop *ev_base,
+ struct rspamd_classifier *cl)
{
cl->cfg->flags |= RSPAMD_FLAG_CLASSIFIER_INTEGER;
return TRUE;
}
-void
-bayes_fin (struct rspamd_classifier *cl)
+void bayes_fin(struct rspamd_classifier *cl)
{
}
gboolean
-bayes_classify (struct rspamd_classifier * ctx,
- GPtrArray *tokens,
- struct rspamd_task *task)
+bayes_classify(struct rspamd_classifier *ctx,
+ GPtrArray *tokens,
+ struct rspamd_task *task)
{
double final_prob, h, s, *pprob;
gchar sumbuf[32];
@@ -278,41 +276,41 @@ bayes_classify (struct rspamd_classifier * ctx,
guint i, text_tokens = 0;
gint id;
- g_assert (ctx != NULL);
- g_assert (tokens != NULL);
+ g_assert(ctx != NULL);
+ g_assert(tokens != NULL);
- memset (&cl, 0, sizeof (cl));
+ memset(&cl, 0, sizeof(cl));
cl.task = task;
/* Check min learns */
if (ctx->cfg->min_learns > 0) {
if (ctx->ham_learns < ctx->cfg->min_learns) {
- msg_info_task ("not classified as ham. The ham class needs more "
- "training samples. Currently: %ul; minimum %ud required",
- ctx->ham_learns, ctx->cfg->min_learns);
+ msg_info_task("not classified as ham. The ham class needs more "
+ "training samples. Currently: %ul; minimum %ud required",
+ ctx->ham_learns, ctx->cfg->min_learns);
return TRUE;
}
if (ctx->spam_learns < ctx->cfg->min_learns) {
- msg_info_task ("not classified as spam. The spam class needs more "
- "training samples. Currently: %ul; minimum %ud required",
- ctx->spam_learns, ctx->cfg->min_learns);
+ msg_info_task("not classified as spam. The spam class needs more "
+ "training samples. Currently: %ul; minimum %ud required",
+ ctx->spam_learns, ctx->cfg->min_learns);
return TRUE;
}
}
- for (i = 0; i < tokens->len; i ++) {
- tok = g_ptr_array_index (tokens, i);
+ for (i = 0; i < tokens->len; i++) {
+ tok = g_ptr_array_index(tokens, i);
if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_META)) {
- text_tokens ++;
+ text_tokens++;
}
}
if (text_tokens == 0) {
- msg_info_task ("skipped classification as there are no text tokens. "
- "Total tokens: %ud",
- tokens->len);
+ msg_info_task("skipped classification as there are no text tokens. "
+ "Total tokens: %ud",
+ tokens->len);
return TRUE;
}
@@ -327,42 +325,42 @@ bayes_classify (struct rspamd_classifier * ctx,
cl.meta_skip_prob = 1.0 - text_tokens / tokens->len;
}
- for (i = 0; i < tokens->len; i ++) {
- tok = g_ptr_array_index (tokens, i);
+ for (i = 0; i < tokens->len; i++) {
+ tok = g_ptr_array_index(tokens, i);
- bayes_classify_token (ctx, tok, &cl);
+ bayes_classify_token(ctx, tok, &cl);
}
if (cl.processed_tokens == 0) {
- msg_info_bayes ("no tokens found in bayes database "
- "(%ud total tokens, %ud text tokens), ignore stats",
- tokens->len, text_tokens);
+ msg_info_bayes("no tokens found in bayes database "
+ "(%ud total tokens, %ud text tokens), ignore stats",
+ tokens->len, text_tokens);
return TRUE;
}
if (ctx->cfg->min_tokens > 0 &&
- cl.text_tokens < (gint)(ctx->cfg->min_tokens * 0.1)) {
- msg_info_bayes ("ignore bayes probability since we have "
- "found too few text tokens: %uL (of %ud checked), "
- "at least %d required",
- cl.text_tokens,
- text_tokens,
- (gint)(ctx->cfg->min_tokens * 0.1));
+ cl.text_tokens < (gint) (ctx->cfg->min_tokens * 0.1)) {
+ msg_info_bayes("ignore bayes probability since we have "
+ "found too few text tokens: %uL (of %ud checked), "
+ "at least %d required",
+ cl.text_tokens,
+ text_tokens,
+ (gint) (ctx->cfg->min_tokens * 0.1));
return TRUE;
}
if (cl.spam_prob > -300 && cl.ham_prob > -300) {
/* Fisher value is low enough to apply inv_chi_square */
- h = 1 - inv_chi_square (task, cl.spam_prob, cl.processed_tokens);
- s = 1 - inv_chi_square (task, cl.ham_prob, cl.processed_tokens);
+ h = 1 - inv_chi_square(task, cl.spam_prob, cl.processed_tokens);
+ s = 1 - inv_chi_square(task, cl.ham_prob, cl.processed_tokens);
}
else {
/* Use naive method */
if (cl.spam_prob < cl.ham_prob) {
h = (1.0 - exp(cl.spam_prob - cl.ham_prob)) /
- (1.0 + exp(cl.spam_prob - cl.ham_prob));
+ (1.0 + exp(cl.spam_prob - cl.ham_prob));
s = 1.0 - h;
}
else {
@@ -372,51 +370,51 @@ bayes_classify (struct rspamd_classifier * ctx,
}
}
- if (isfinite (s) && isfinite (h)) {
+ if (isfinite(s) && isfinite(h)) {
final_prob = (s + 1.0 - h) / 2.;
- msg_debug_bayes (
- "got ham probability %.2f -> %.2f and spam probability %.2f -> %.2f,"
- " %L tokens processed of %ud total tokens;"
- " %uL text tokens found of %ud text tokens)",
- cl.ham_prob,
- h,
- cl.spam_prob,
- s,
- cl.processed_tokens,
- tokens->len,
- cl.text_tokens,
- text_tokens);
+ msg_debug_bayes(
+ "got ham probability %.2f -> %.2f and spam probability %.2f -> %.2f,"
+ " %L tokens processed of %ud total tokens;"
+ " %uL text tokens found of %ud text tokens)",
+ cl.ham_prob,
+ h,
+ cl.spam_prob,
+ s,
+ cl.processed_tokens,
+ tokens->len,
+ cl.text_tokens,
+ text_tokens);
}
else {
/*
* We have some overflow, hence we need to check which class
* is NaN
*/
- if (isfinite (h)) {
+ if (isfinite(h)) {
final_prob = 1.0;
- msg_debug_bayes ("spam class is full: no"
- " ham samples");
+ msg_debug_bayes("spam class is full: no"
+ " ham samples");
}
- else if (isfinite (s)) {
+ else if (isfinite(s)) {
final_prob = 0.0;
- msg_debug_bayes ("ham class is full: no"
- " spam samples");
+ msg_debug_bayes("ham class is full: no"
+ " spam samples");
}
else {
final_prob = 0.5;
- msg_warn_bayes ("spam and ham classes are both full");
+ msg_warn_bayes("spam and ham classes are both full");
}
}
- pprob = rspamd_mempool_alloc (task->task_pool, sizeof (*pprob));
+ pprob = rspamd_mempool_alloc(task->task_pool, sizeof(*pprob));
*pprob = final_prob;
- rspamd_mempool_set_variable (task->task_pool, "bayes_prob", pprob, NULL);
+ rspamd_mempool_set_variable(task->task_pool, "bayes_prob", pprob, NULL);
- if (cl.processed_tokens > 0 && fabs (final_prob - 0.5) > 0.05) {
+ if (cl.processed_tokens > 0 && fabs(final_prob - 0.5) > 0.05) {
/* Now we can have exactly one HAM and exactly one SPAM statfiles per classifier */
for (i = 0; i < ctx->statfiles_ids->len; i++) {
- id = g_array_index (ctx->statfiles_ids, gint, i);
- st = g_ptr_array_index (ctx->ctx->statfiles, id);
+ id = g_array_index(ctx->statfiles_ids, gint, i);
+ st = g_ptr_array_index(ctx->ctx->statfiles, id);
if (final_prob > 0.5 && st->stcf->is_spam) {
break;
@@ -435,14 +433,15 @@ bayes_classify (struct rspamd_classifier * ctx,
* Bayes p is from 0.5 to 1.0, but confidence is from 0 to 1, so
* we need to rescale it to display correctly
*/
- rspamd_snprintf (sumbuf, sizeof (sumbuf), "%.2f%%",
- (final_prob - 0.5) * 200.);
- final_prob = rspamd_normalize_probability (final_prob, 0.5);
- g_assert (st != NULL);
+ rspamd_snprintf(sumbuf, sizeof(sumbuf), "%.2f%%",
+ (final_prob - 0.5) * 200.);
+ final_prob = rspamd_normalize_probability(final_prob, 0.5);
+ g_assert(st != NULL);
if (final_prob > 1 || final_prob < 0) {
- msg_err_bayes ("internal error: probability %f is outside of the "
- "allowed range [0..1]", final_prob);
+ msg_err_bayes("internal error: probability %f is outside of the "
+ "allowed range [0..1]",
+ final_prob);
if (final_prob > 1) {
final_prob = 1.0;
@@ -452,22 +451,22 @@ bayes_classify (struct rspamd_classifier * ctx,
}
}
- rspamd_task_insert_result (task,
- st->stcf->symbol,
- final_prob,
- sumbuf);
+ rspamd_task_insert_result(task,
+ st->stcf->symbol,
+ final_prob,
+ sumbuf);
}
return TRUE;
}
gboolean
-bayes_learn_spam (struct rspamd_classifier * ctx,
- GPtrArray *tokens,
- struct rspamd_task *task,
- gboolean is_spam,
- gboolean unlearn,
- GError **err)
+bayes_learn_spam(struct rspamd_classifier *ctx,
+ GPtrArray *tokens,
+ struct rspamd_task *task,
+ gboolean is_spam,
+ gboolean unlearn,
+ GError **err)
{
guint i, j, total_cnt, spam_cnt, ham_cnt;
gint id;
@@ -475,8 +474,8 @@ bayes_learn_spam (struct rspamd_classifier * ctx,
rspamd_token_t *tok;
gboolean incrementing;
- g_assert (ctx != NULL);
- g_assert (tokens != NULL);
+ g_assert(ctx != NULL);
+ g_assert(tokens != NULL);
incrementing = ctx->cfg->flags & RSPAMD_FLAG_CLASSIFIER_INCREMENTING_BACKEND;
@@ -484,12 +483,12 @@ bayes_learn_spam (struct rspamd_classifier * ctx,
total_cnt = 0;
spam_cnt = 0;
ham_cnt = 0;
- tok = g_ptr_array_index (tokens, i);
+ tok = g_ptr_array_index(tokens, i);
for (j = 0; j < ctx->statfiles_ids->len; j++) {
- id = g_array_index (ctx->statfiles_ids, gint, j);
- st = g_ptr_array_index (ctx->ctx->statfiles, id);
- g_assert (st != NULL);
+ id = g_array_index(ctx->statfiles_ids, gint, j);
+ st = g_ptr_array_index(ctx->ctx->statfiles, id);
+ g_assert(st != NULL);
if (!!st->stcf->is_spam == !!is_spam) {
if (incrementing) {
@@ -533,18 +532,18 @@ bayes_learn_spam (struct rspamd_classifier * ctx,
}
if (tok->t1 && tok->t2) {
- msg_debug_bayes ("token %uL <%*s:%*s>: window: %d, total_count: %d, "
- "spam_count: %d, ham_count: %d",
- tok->data,
- (int) tok->t1->stemmed.len, tok->t1->stemmed.begin,
- (int) tok->t2->stemmed.len, tok->t2->stemmed.begin,
- tok->window_idx, total_cnt, spam_cnt, ham_cnt);
+ msg_debug_bayes("token %uL <%*s:%*s>: window: %d, total_count: %d, "
+ "spam_count: %d, ham_count: %d",
+ tok->data,
+ (int) tok->t1->stemmed.len, tok->t1->stemmed.begin,
+ (int) tok->t2->stemmed.len, tok->t2->stemmed.begin,
+ tok->window_idx, total_cnt, spam_cnt, ham_cnt);
}
else {
- msg_debug_bayes ("token %uL <?:?>: window: %d, total_count: %d, "
- "spam_count: %d, ham_count: %d",
- tok->data,
- tok->window_idx, total_cnt, spam_cnt, ham_cnt);
+ msg_debug_bayes("token %uL <?:?>: window: %d, total_count: %d, "
+ "spam_count: %d, ham_count: %d",
+ tok->data,
+ tok->window_idx, total_cnt, spam_cnt, ham_cnt);
}
}
diff --git a/src/libstat/classifiers/classifiers.h b/src/libstat/classifiers/classifiers.h
index 32473cdd1..f6109c3e5 100644
--- a/src/libstat/classifiers/classifiers.h
+++ b/src/libstat/classifiers/classifiers.h
@@ -9,7 +9,7 @@
/* Consider this value as 0 */
#define ALPHA 0.0001
-#ifdef __cplusplus
+#ifdef __cplusplus
extern "C" {
#endif
@@ -23,66 +23,66 @@ struct token_node_s;
struct rspamd_stat_classifier {
char *name;
- gboolean (*init_func) (struct rspamd_config *cfg,
- struct ev_loop *ev_base,
- struct rspamd_classifier *cl);
+ gboolean (*init_func)(struct rspamd_config *cfg,
+ struct ev_loop *ev_base,
+ struct rspamd_classifier *cl);
- gboolean (*classify_func) (struct rspamd_classifier *ctx,
- GPtrArray *tokens,
- struct rspamd_task *task);
+ gboolean (*classify_func)(struct rspamd_classifier *ctx,
+ GPtrArray *tokens,
+ struct rspamd_task *task);
- gboolean (*learn_spam_func) (struct rspamd_classifier *ctx,
- GPtrArray *input,
- struct rspamd_task *task,
- gboolean is_spam,
- gboolean unlearn,
- GError **err);
+ gboolean (*learn_spam_func)(struct rspamd_classifier *ctx,
+ GPtrArray *input,
+ struct rspamd_task *task,
+ gboolean is_spam,
+ gboolean unlearn,
+ GError **err);
- void (*fin_func) (struct rspamd_classifier *cl);
+ void (*fin_func)(struct rspamd_classifier *cl);
};
/* Bayes algorithm */
-gboolean bayes_init (struct rspamd_config *cfg,
- struct ev_loop *ev_base,
- struct rspamd_classifier *);
+gboolean bayes_init(struct rspamd_config *cfg,
+ struct ev_loop *ev_base,
+ struct rspamd_classifier *);
-gboolean bayes_classify (struct rspamd_classifier *ctx,
- GPtrArray *tokens,
- struct rspamd_task *task);
+gboolean bayes_classify(struct rspamd_classifier *ctx,
+ GPtrArray *tokens,
+ struct rspamd_task *task);
-gboolean bayes_learn_spam (struct rspamd_classifier *ctx,
- GPtrArray *tokens,
- struct rspamd_task *task,
- gboolean is_spam,
- gboolean unlearn,
- GError **err);
+gboolean bayes_learn_spam(struct rspamd_classifier *ctx,
+ GPtrArray *tokens,
+ struct rspamd_task *task,
+ gboolean is_spam,
+ gboolean unlearn,
+ GError **err);
-void bayes_fin (struct rspamd_classifier *);
+void bayes_fin(struct rspamd_classifier *);
/* Generic lua classifier */
-gboolean lua_classifier_init (struct rspamd_config *cfg,
- struct ev_loop *ev_base,
- struct rspamd_classifier *);
+gboolean lua_classifier_init(struct rspamd_config *cfg,
+ struct ev_loop *ev_base,
+ struct rspamd_classifier *);
-gboolean lua_classifier_classify (struct rspamd_classifier *ctx,
- GPtrArray *tokens,
- struct rspamd_task *task);
+gboolean lua_classifier_classify(struct rspamd_classifier *ctx,
+ GPtrArray *tokens,
+ struct rspamd_task *task);
-gboolean lua_classifier_learn_spam (struct rspamd_classifier *ctx,
- GPtrArray *tokens,
- struct rspamd_task *task,
- gboolean is_spam,
- gboolean unlearn,
- GError **err);
+gboolean lua_classifier_learn_spam(struct rspamd_classifier *ctx,
+ GPtrArray *tokens,
+ struct rspamd_task *task,
+ gboolean is_spam,
+ gboolean unlearn,
+ GError **err);
extern gint rspamd_bayes_log_id;
-#define msg_debug_bayes(...) rspamd_conditional_debug_fast (NULL, task->from_addr, \
- rspamd_bayes_log_id, "bayes", task->task_pool->tag.uid, \
- G_STRFUNC, \
- __VA_ARGS__)
+#define msg_debug_bayes(...) rspamd_conditional_debug_fast(NULL, task->from_addr, \
+ rspamd_bayes_log_id, "bayes", task->task_pool->tag.uid, \
+ G_STRFUNC, \
+ __VA_ARGS__)
-#ifdef __cplusplus
+#ifdef __cplusplus
}
#endif
diff --git a/src/libstat/classifiers/lua_classifier.c b/src/libstat/classifiers/lua_classifier.c
index 41657abc0..b74330dca 100644
--- a/src/libstat/classifiers/lua_classifier.c
+++ b/src/libstat/classifiers/lua_classifier.c
@@ -27,108 +27,108 @@ struct rspamd_lua_classifier_ctx {
static GHashTable *lua_classifiers = NULL;
-#define msg_err_luacl(...) rspamd_default_log_function (G_LOG_LEVEL_CRITICAL, \
- "luacl", task->task_pool->tag.uid, \
- RSPAMD_LOG_FUNC, \
- __VA_ARGS__)
-#define msg_warn_luacl(...) rspamd_default_log_function (G_LOG_LEVEL_WARNING, \
- "luacl", task->task_pool->tag.uid, \
- RSPAMD_LOG_FUNC, \
- __VA_ARGS__)
-#define msg_info_luacl(...) rspamd_default_log_function (G_LOG_LEVEL_INFO, \
- "luacl", task->task_pool->tag.uid, \
- RSPAMD_LOG_FUNC, \
- __VA_ARGS__)
-#define msg_debug_luacl(...) rspamd_conditional_debug_fast (NULL, task->from_addr, \
- rspamd_luacl_log_id, "luacl", task->task_pool->tag.uid, \
- RSPAMD_LOG_FUNC, \
- __VA_ARGS__)
+#define msg_err_luacl(...) rspamd_default_log_function(G_LOG_LEVEL_CRITICAL, \
+ "luacl", task->task_pool->tag.uid, \
+ RSPAMD_LOG_FUNC, \
+ __VA_ARGS__)
+#define msg_warn_luacl(...) rspamd_default_log_function(G_LOG_LEVEL_WARNING, \
+ "luacl", task->task_pool->tag.uid, \
+ RSPAMD_LOG_FUNC, \
+ __VA_ARGS__)
+#define msg_info_luacl(...) rspamd_default_log_function(G_LOG_LEVEL_INFO, \
+ "luacl", task->task_pool->tag.uid, \
+ RSPAMD_LOG_FUNC, \
+ __VA_ARGS__)
+#define msg_debug_luacl(...) rspamd_conditional_debug_fast(NULL, task->from_addr, \
+ rspamd_luacl_log_id, "luacl", task->task_pool->tag.uid, \
+ RSPAMD_LOG_FUNC, \
+ __VA_ARGS__)
INIT_LOG_MODULE(luacl)
gboolean
-lua_classifier_init (struct rspamd_config *cfg,
- struct ev_loop *ev_base,
- struct rspamd_classifier *cl)
+lua_classifier_init(struct rspamd_config *cfg,
+ struct ev_loop *ev_base,
+ struct rspamd_classifier *cl)
{
struct rspamd_lua_classifier_ctx *ctx;
lua_State *L = cl->ctx->cfg->lua_state;
gint cb_classify = -1, cb_learn = -1;
if (lua_classifiers == NULL) {
- lua_classifiers = g_hash_table_new_full (rspamd_strcase_hash,
- rspamd_strcase_equal, g_free, g_free);
+ lua_classifiers = g_hash_table_new_full(rspamd_strcase_hash,
+ rspamd_strcase_equal, g_free, g_free);
}
- ctx = g_hash_table_lookup (lua_classifiers, cl->subrs->name);
+ ctx = g_hash_table_lookup(lua_classifiers, cl->subrs->name);
if (ctx != NULL) {
- msg_err_config ("duplicate lua classifier definition: %s",
- cl->subrs->name);
+ msg_err_config("duplicate lua classifier definition: %s",
+ cl->subrs->name);
return FALSE;
}
- lua_getglobal (L, "rspamd_classifiers");
- if (lua_type (L, -1) != LUA_TTABLE) {
- msg_err_config ("cannot register classifier %s: no rspamd_classifier global",
- cl->subrs->name);
- lua_pop (L, 1);
+ lua_getglobal(L, "rspamd_classifiers");
+ if (lua_type(L, -1) != LUA_TTABLE) {
+ msg_err_config("cannot register classifier %s: no rspamd_classifier global",
+ cl->subrs->name);
+ lua_pop(L, 1);
return FALSE;
}
- lua_pushstring (L, cl->subrs->name);
- lua_gettable (L, -2);
+ lua_pushstring(L, cl->subrs->name);
+ lua_gettable(L, -2);
- if (lua_type (L, -1) != LUA_TTABLE) {
- msg_err_config ("cannot register classifier %s: bad lua type: %s",
- cl->subrs->name, lua_typename (L, lua_type (L, -1)));
- lua_pop (L, 2);
+ if (lua_type(L, -1) != LUA_TTABLE) {
+ msg_err_config("cannot register classifier %s: bad lua type: %s",
+ cl->subrs->name, lua_typename(L, lua_type(L, -1)));
+ lua_pop(L, 2);
return FALSE;
}
- lua_pushstring (L, "classify");
- lua_gettable (L, -2);
+ lua_pushstring(L, "classify");
+ lua_gettable(L, -2);
- if (lua_type (L, -1) != LUA_TFUNCTION) {
- msg_err_config ("cannot register classifier %s: bad lua type for classify: %s",
- cl->subrs->name, lua_typename (L, lua_type (L, -1)));
- lua_pop (L, 3);
+ if (lua_type(L, -1) != LUA_TFUNCTION) {
+ msg_err_config("cannot register classifier %s: bad lua type for classify: %s",
+ cl->subrs->name, lua_typename(L, lua_type(L, -1)));
+ lua_pop(L, 3);
return FALSE;
}
- cb_classify = luaL_ref (L, LUA_REGISTRYINDEX);
+ cb_classify = luaL_ref(L, LUA_REGISTRYINDEX);
- lua_pushstring (L, "learn");
- lua_gettable (L, -2);
+ lua_pushstring(L, "learn");
+ lua_gettable(L, -2);
- if (lua_type (L, -1) != LUA_TFUNCTION) {
- msg_err_config ("cannot register classifier %s: bad lua type for learn: %s",
- cl->subrs->name, lua_typename (L, lua_type (L, -1)));
- lua_pop (L, 3);
+ if (lua_type(L, -1) != LUA_TFUNCTION) {
+ msg_err_config("cannot register classifier %s: bad lua type for learn: %s",
+ cl->subrs->name, lua_typename(L, lua_type(L, -1)));
+ lua_pop(L, 3);
return FALSE;
}
- cb_learn = luaL_ref (L, LUA_REGISTRYINDEX);
- lua_pop (L, 2); /* Table + global */
+ cb_learn = luaL_ref(L, LUA_REGISTRYINDEX);
+ lua_pop(L, 2); /* Table + global */
- ctx = g_malloc0 (sizeof (*ctx));
- ctx->name = g_strdup (cl->subrs->name);
+ ctx = g_malloc0(sizeof(*ctx));
+ ctx->name = g_strdup(cl->subrs->name);
ctx->classify_ref = cb_classify;
ctx->learn_ref = cb_learn;
cl->cfg->flags |= RSPAMD_FLAG_CLASSIFIER_NO_BACKEND;
- g_hash_table_insert (lua_classifiers, ctx->name, ctx);
+ g_hash_table_insert(lua_classifiers, ctx->name, ctx);
return TRUE;
}
gboolean
-lua_classifier_classify (struct rspamd_classifier *cl,
- GPtrArray *tokens,
- struct rspamd_task *task)
+lua_classifier_classify(struct rspamd_classifier *cl,
+ GPtrArray *tokens,
+ struct rspamd_task *task)
{
struct rspamd_lua_classifier_ctx *ctx;
struct rspamd_task **ptask;
@@ -138,38 +138,38 @@ lua_classifier_classify (struct rspamd_classifier *cl,
guint i;
guint64 v;
- ctx = g_hash_table_lookup (lua_classifiers, cl->subrs->name);
- g_assert (ctx != NULL);
+ ctx = g_hash_table_lookup(lua_classifiers, cl->subrs->name);
+ g_assert(ctx != NULL);
L = task->cfg->lua_state;
- lua_rawgeti (L, LUA_REGISTRYINDEX, ctx->classify_ref);
- ptask = lua_newuserdata (L, sizeof (*ptask));
+ lua_rawgeti(L, LUA_REGISTRYINDEX, ctx->classify_ref);
+ ptask = lua_newuserdata(L, sizeof(*ptask));
*ptask = task;
- rspamd_lua_setclass (L, "rspamd{task}", -1);
- pcfg = lua_newuserdata (L, sizeof (*pcfg));
+ rspamd_lua_setclass(L, "rspamd{task}", -1);
+ pcfg = lua_newuserdata(L, sizeof(*pcfg));
*pcfg = cl->cfg;
- rspamd_lua_setclass (L, "rspamd{classifier}", -1);
+ rspamd_lua_setclass(L, "rspamd{classifier}", -1);
- lua_createtable (L, tokens->len, 0);
+ lua_createtable(L, tokens->len, 0);
- for (i = 0; i < tokens->len; i ++) {
- tok = g_ptr_array_index (tokens, i);
+ for (i = 0; i < tokens->len; i++) {
+ tok = g_ptr_array_index(tokens, i);
v = tok->data;
- lua_createtable (L, 3, 0);
+ lua_createtable(L, 3, 0);
/* High word, low word, order */
- lua_pushinteger (L, (guint32)(v >> 32));
- lua_rawseti (L, -2, 1);
- lua_pushinteger (L, (guint32)(v));
- lua_rawseti (L, -2, 2);
- lua_pushinteger (L, tok->window_idx);
- lua_rawseti (L, -2, 3);
- lua_rawseti (L, -2, i + 1);
+ lua_pushinteger(L, (guint32) (v >> 32));
+ lua_rawseti(L, -2, 1);
+ lua_pushinteger(L, (guint32) (v));
+ lua_rawseti(L, -2, 2);
+ lua_pushinteger(L, tok->window_idx);
+ lua_rawseti(L, -2, 3);
+ lua_rawseti(L, -2, i + 1);
}
- if (lua_pcall (L, 3, 0, 0) != 0) {
- msg_err_luacl ("error running classify function for %s: %s", ctx->name,
- lua_tostring (L, -1));
- lua_pop (L, 1);
+ if (lua_pcall(L, 3, 0, 0) != 0) {
+ msg_err_luacl("error running classify function for %s: %s", ctx->name,
+ lua_tostring(L, -1));
+ lua_pop(L, 1);
return FALSE;
}
@@ -178,12 +178,12 @@ lua_classifier_classify (struct rspamd_classifier *cl,
}
gboolean
-lua_classifier_learn_spam (struct rspamd_classifier *cl,
- GPtrArray *tokens,
- struct rspamd_task *task,
- gboolean is_spam,
- gboolean unlearn,
- GError **err)
+lua_classifier_learn_spam(struct rspamd_classifier *cl,
+ GPtrArray *tokens,
+ struct rspamd_task *task,
+ gboolean is_spam,
+ gboolean unlearn,
+ GError **err)
{
struct rspamd_lua_classifier_ctx *ctx;
struct rspamd_task **ptask;
@@ -193,42 +193,42 @@ lua_classifier_learn_spam (struct rspamd_classifier *cl,
guint i;
guint64 v;
- ctx = g_hash_table_lookup (lua_classifiers, cl->subrs->name);
- g_assert (ctx != NULL);
+ ctx = g_hash_table_lookup(lua_classifiers, cl->subrs->name);
+ g_assert(ctx != NULL);
L = task->cfg->lua_state;
- lua_rawgeti (L, LUA_REGISTRYINDEX, ctx->learn_ref);
- ptask = lua_newuserdata (L, sizeof (*ptask));
+ lua_rawgeti(L, LUA_REGISTRYINDEX, ctx->learn_ref);
+ ptask = lua_newuserdata(L, sizeof(*ptask));
*ptask = task;
- rspamd_lua_setclass (L, "rspamd{task}", -1);
- pcfg = lua_newuserdata (L, sizeof (*pcfg));
+ rspamd_lua_setclass(L, "rspamd{task}", -1);
+ pcfg = lua_newuserdata(L, sizeof(*pcfg));
*pcfg = cl->cfg;
- rspamd_lua_setclass (L, "rspamd{classifier}", -1);
+ rspamd_lua_setclass(L, "rspamd{classifier}", -1);
- lua_createtable (L, tokens->len, 0);
+ lua_createtable(L, tokens->len, 0);
- for (i = 0; i < tokens->len; i ++) {
- tok = g_ptr_array_index (tokens, i);
+ for (i = 0; i < tokens->len; i++) {
+ tok = g_ptr_array_index(tokens, i);
v = 0;
v = tok->data;
- lua_createtable (L, 3, 0);
+ lua_createtable(L, 3, 0);
/* High word, low word, order */
- lua_pushinteger (L, (guint32)(v >> 32));
- lua_rawseti (L, -2, 1);
- lua_pushinteger (L, (guint32)(v));
- lua_rawseti (L, -2, 2);
- lua_pushinteger (L, tok->window_idx);
- lua_rawseti (L, -2, 3);
- lua_rawseti (L, -2, i + 1);
+ lua_pushinteger(L, (guint32) (v >> 32));
+ lua_rawseti(L, -2, 1);
+ lua_pushinteger(L, (guint32) (v));
+ lua_rawseti(L, -2, 2);
+ lua_pushinteger(L, tok->window_idx);
+ lua_rawseti(L, -2, 3);
+ lua_rawseti(L, -2, i + 1);
}
- lua_pushboolean (L, is_spam);
- lua_pushboolean (L, unlearn);
+ lua_pushboolean(L, is_spam);
+ lua_pushboolean(L, unlearn);
- if (lua_pcall (L, 5, 0, 0) != 0) {
- msg_err_luacl ("error running learn function for %s: %s", ctx->name,
- lua_tostring (L, -1));
- lua_pop (L, 1);
+ if (lua_pcall(L, 5, 0, 0) != 0) {
+ msg_err_luacl("error running learn function for %s: %s", ctx->name,
+ lua_tostring(L, -1));
+ lua_pop(L, 1);
return FALSE;
}