From e737e9942cc0c0cbd18dcbc9a1feb0a4b1c48a11 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 4 Apr 2017 16:49:44 +0100 Subject: [PATCH] [Rework] Set token data as uint64_t instead of chars array --- src/libstat/backends/mmaped_file.c | 8 +- src/libstat/backends/redis_backend.c | 6 +- src/libstat/backends/sqlite3_backend.c | 11 +-- src/libstat/classifiers/bayes.c | 63 +++++++++++++--- src/libstat/classifiers/lua_classifier.c | 5 +- src/libstat/learn_cache/redis_cache.c | 3 +- src/libstat/learn_cache/sqlite3_cache.c | 3 +- src/libstat/stat_internal.h | 4 +- src/libstat/stat_process.c | 93 +++++++++++++----------- src/libstat/tokenizers/osb.c | 8 +- src/libstat/tokenizers/tokenizers.c | 12 --- 11 files changed, 120 insertions(+), 96 deletions(-) diff --git a/src/libstat/backends/mmaped_file.c b/src/libstat/backends/mmaped_file.c index 50a635432..65ccb5aa1 100644 --- a/src/libstat/backends/mmaped_file.c +++ b/src/libstat/backends/mmaped_file.c @@ -963,8 +963,8 @@ rspamd_mmaped_file_process_tokens (struct rspamd_task *task, GPtrArray *tokens, for (i = 0; i < tokens->len; i++) { tok = g_ptr_array_index (tokens, i); - memcpy (&h1, tok->data, sizeof (h1)); - memcpy (&h2, tok->data + sizeof (h1), sizeof (h2)); + memcpy (&h1, (guchar *)&tok->data, sizeof (h1)); + memcpy (&h2, ((guchar *)&tok->data) + sizeof (h1), sizeof (h2)); tok->values[id] = rspamd_mmaped_file_get_block (mf, h1, h2); } @@ -993,8 +993,8 @@ rspamd_mmaped_file_learn_tokens (struct rspamd_task *task, GPtrArray *tokens, for (i = 0; i < tokens->len; i++) { tok = g_ptr_array_index (tokens, i); - memcpy (&h1, tok->data, sizeof (h1)); - memcpy (&h2, tok->data + sizeof (h1), sizeof (h2)); + memcpy (&h1, (guchar *)&tok->data, sizeof (h1)); + memcpy (&h2, ((guchar *)&tok->data) + sizeof (h1), sizeof (h2)); rspamd_mmaped_file_set_block (task->task_pool, mf, h1, h2, tok->values[id]); } diff --git a/src/libstat/backends/redis_backend.c b/src/libstat/backends/redis_backend.c index 869ecad0f..5c66c00a7 100644 --- a/src/libstat/backends/redis_backend.c +++ b/src/libstat/backends/redis_backend.c @@ -343,7 +343,6 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task, GPtrArray *tokens, rspamd_token_t *tok; gchar n0[64], n1[64]; guint i, l0, l1, larg0, larg1; - guint64 num; g_assert (tokens != NULL); @@ -365,7 +364,6 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task, GPtrArray *tokens, for (i = 0; i < tokens->len; i ++) { tok = g_ptr_array_index (tokens, i); - memcpy (&num, tok->data, sizeof (num)); if (learn) { rspamd_printf_fstring (&out, "" @@ -377,7 +375,7 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task, GPtrArray *tokens, larg0, arg0, larg1, arg1); - l0 = rspamd_snprintf (n0, sizeof (n0), "%uL", num); + l0 = rspamd_snprintf (n0, sizeof (n0), "%uL", tok->data); if (intvals) { l1 = rspamd_snprintf (n1, sizeof (n1), "%L", @@ -395,7 +393,7 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task, GPtrArray *tokens, "%s\r\n", l0, n0, l1, n1); } else { - l0 = rspamd_snprintf (n0, sizeof (n0), "%uL", num); + l0 = rspamd_snprintf (n0, sizeof (n0), "%uL", tok->data); rspamd_printf_fstring (&out, "" "$%d\r\n" "%s\r\n", l0, n0); diff --git a/src/libstat/backends/sqlite3_backend.c b/src/libstat/backends/sqlite3_backend.c index 2e49f8076..ec47b06a9 100644 --- a/src/libstat/backends/sqlite3_backend.c +++ b/src/libstat/backends/sqlite3_backend.c @@ -672,7 +672,7 @@ rspamd_sqlite3_process_tokens (struct rspamd_task *task, { struct rspamd_stat_sqlite3_db *bk; struct rspamd_stat_sqlite3_rt *rt = p; - gint64 iv = 0, idx; + gint64 iv = 0; guint i; rspamd_token_t *tok; @@ -714,11 +714,9 @@ rspamd_sqlite3_process_tokens (struct rspamd_task *task, } } - memcpy (&idx, tok->data, sizeof (idx)); - if (rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt, RSPAMD_STAT_BACKEND_GET_TOKEN, - idx, rt->user_id, rt->lang_id, &iv) == SQLITE_OK) { + tok->data, rt->user_id, rt->lang_id, &iv) == SQLITE_OK) { tok->values[id] = iv; } else { @@ -765,7 +763,7 @@ rspamd_sqlite3_learn_tokens (struct rspamd_task *task, GPtrArray *tokens, { struct rspamd_stat_sqlite3_db *bk; struct rspamd_stat_sqlite3_rt *rt = p; - gint64 iv = 0, idx; + gint64 iv = 0; guint i; rspamd_token_t *tok; @@ -806,11 +804,10 @@ rspamd_sqlite3_learn_tokens (struct rspamd_task *task, GPtrArray *tokens, } iv = tok->values[id]; - memcpy (&idx, tok->data, sizeof (idx)); if (rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt, RSPAMD_STAT_BACKEND_SET_TOKEN, - idx, rt->user_id, rt->lang_id, iv) != SQLITE_OK) { + tok->data, rt->user_id, rt->lang_id, iv) != SQLITE_OK) { rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt, RSPAMD_STAT_BACKEND_TRANSACTION_ROLLBACK); bk->in_transaction = FALSE; diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c index 95bd1f5ea..c9faae6bd 100644 --- a/src/libstat/classifiers/bayes.c +++ b/src/libstat/classifiers/bayes.c @@ -159,11 +159,12 @@ bayes_classify_token (struct rspamd_classifier *ctx, cl->processed_tokens ++; if (tok->t1 && tok->t2) { - msg_debug_bayes ("token <%*s:%*s>: weight: %f, total_count: %L, " + msg_debug_bayes ("token %uL <%*s:%*s>: weight: %f, total_count: %L, " "spam_count: %L, ham_count: %L," "spam_prob: %.3f, ham_prob: %.3f, " "bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, " "current spam prob: %.3f, current ham prob: %.3f", + tok->data, (int) tok->t1->len, tok->t1->begin, (int) tok->t2->len, tok->t2->begin, fw, total_count, spam_count, ham_count, @@ -172,11 +173,12 @@ bayes_classify_token (struct rspamd_classifier *ctx, cl->spam_prob, cl->ham_prob); } else { - msg_debug_bayes ("token : weight: %f, total_count: %L, " + msg_debug_bayes ("token %uL : weight: %f, total_count: %L, " "spam_count: %L, ham_count: %L," "spam_prob: %.3f, ham_prob: %.3f, " "bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, " "current spam prob: %.3f, current ham prob: %.3f", + tok->data, fw, total_count, spam_count, ham_count, spam_prob, ham_prob, bayes_spam_prob, bayes_ham_prob, @@ -324,7 +326,7 @@ bayes_learn_spam (struct rspamd_classifier * ctx, gboolean unlearn, GError **err) { - guint i, j; + guint i, j, total_cnt, spam_cnt, ham_cnt; gint id; struct rspamd_statfile *st; rspamd_token_t *tok; @@ -336,6 +338,9 @@ bayes_learn_spam (struct rspamd_classifier * ctx, incrementing = ctx->cfg->flags & RSPAMD_FLAG_CLASSIFIER_INCREMENTING_BACKEND; for (i = 0; i < tokens->len; i++) { + total_cnt = 0; + spam_cnt = 0; + ham_cnt = 0; tok = g_ptr_array_index (tokens, i); for (j = 0; j < ctx->statfiles_ids->len; j++) { @@ -350,21 +355,55 @@ bayes_learn_spam (struct rspamd_classifier * ctx, else { tok->values[id]++; } - } - else if (tok->values[id] > 0 && unlearn) { - /* Unlearning */ - if (incrementing) { - tok->values[id] = -1; + + total_cnt += tok->values[id]; + + if (st->stcf->is_spam) { + spam_cnt += tok->values[id]; } else { - tok->values[id]--; + ham_cnt += tok->values[id]; } } - else if (incrementing) { - tok->values[id] = 0; + else { + if (tok->values[id] > 0 && unlearn) { + /* Unlearning */ + if (incrementing) { + tok->values[id] = -1; + } + else { + tok->values[id]--; + } + + if (st->stcf->is_spam) { + spam_cnt += tok->values[id]; + } + else { + ham_cnt += tok->values[id]; + } + total_cnt += tok->values[id]; + } + else if (incrementing) { + tok->values[id] = 0; + } } } + + if (tok->t1 && tok->t2) { + msg_debug_bayes ("token %uL <%*s:%*s>: window: %d, total_count: %d, " + "spam_count: %d, ham_count: %d", + tok->data, + (int) tok->t1->len, tok->t1->begin, + (int) tok->t2->len, tok->t2->begin, + tok->window_idx, total_cnt, spam_cnt, ham_cnt); + } + else { + msg_debug_bayes ("token %uL : window: %d, total_count: %d, " + "spam_count: %d, ham_count: %d", + tok->data, + tok->window_idx, total_cnt, spam_cnt, ham_cnt); + } } return TRUE; -} +} \ No newline at end of file diff --git a/src/libstat/classifiers/lua_classifier.c b/src/libstat/classifiers/lua_classifier.c index a28b58d84..823f689d5 100644 --- a/src/libstat/classifiers/lua_classifier.c +++ b/src/libstat/classifiers/lua_classifier.c @@ -151,8 +151,7 @@ lua_classifier_classify (struct rspamd_classifier *cl, for (i = 0; i < tokens->len; i ++) { tok = g_ptr_array_index (tokens, i); - v = 0; - memcpy (&v, tok->data, MIN (sizeof (v), tok->datalen)); + v = tok->data; lua_createtable (L, 3, 0); /* High word, low word, order */ lua_pushnumber (L, (guint32)(v >> 32)); @@ -208,7 +207,7 @@ lua_classifier_learn_spam (struct rspamd_classifier *cl, for (i = 0; i < tokens->len; i ++) { tok = g_ptr_array_index (tokens, i); v = 0; - memcpy (&v, tok->data, MIN (sizeof (v), tok->datalen)); + v = tok->data; lua_createtable (L, 3, 0); /* High word, low word, order */ lua_pushnumber (L, (guint32)(v >> 32)); diff --git a/src/libstat/learn_cache/redis_cache.c b/src/libstat/learn_cache/redis_cache.c index cc6e66469..14a90b9d1 100644 --- a/src/libstat/learn_cache/redis_cache.c +++ b/src/libstat/learn_cache/redis_cache.c @@ -197,7 +197,8 @@ rspamd_stat_cache_redis_generate_id (struct rspamd_task *task) for (i = 0; i < task->tokens->len; i ++) { tok = g_ptr_array_index (task->tokens, i); - rspamd_cryptobox_hash_update (&st, tok->data, tok->datalen); + rspamd_cryptobox_hash_update (&st, (guchar *)&tok->data, + sizeof (tok->data)); } rspamd_cryptobox_hash_final (&st, out); diff --git a/src/libstat/learn_cache/sqlite3_cache.c b/src/libstat/learn_cache/sqlite3_cache.c index 48cfe4af7..055950cf4 100644 --- a/src/libstat/learn_cache/sqlite3_cache.c +++ b/src/libstat/learn_cache/sqlite3_cache.c @@ -200,7 +200,8 @@ rspamd_stat_cache_sqlite3_check (struct rspamd_task *task, for (i = 0; i < task->tokens->len; i ++) { tok = g_ptr_array_index (task->tokens, i); - rspamd_cryptobox_hash_update (&st, tok->data, tok->datalen); + rspamd_cryptobox_hash_update (&st, (guchar *)&tok->data, + sizeof (tok->data)); } rspamd_cryptobox_hash_final (&st, out); diff --git a/src/libstat/stat_internal.h b/src/libstat/stat_internal.h index 3290d5bc2..cab185f7c 100644 --- a/src/libstat/stat_internal.h +++ b/src/libstat/stat_internal.h @@ -51,11 +51,9 @@ struct rspamd_statfile { gpointer bkcf; }; -#define RSPAMD_MAX_TOKEN_LEN 8 typedef struct token_node_s { - guchar data[RSPAMD_MAX_TOKEN_LEN]; + guint64 data; guint window_idx; - guint datalen; guint flags; rspamd_stat_token_t *t1; rspamd_stat_token_t *t2; diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index 6e4287abe..356b53807 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -176,50 +176,55 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx, if (lua_type (L, -1) == LUA_TTABLE) { lua_pushstring (L, "stat_metatokens"); - lua_gettable (L, -1); - - if (lua_type (L, -1) == LUA_TFUNCTION) { - struct rspamd_task **ptask; - - ptask = lua_newuserdata (L, sizeof (*ptask)); - rspamd_lua_setclass (L, "rspamd{task}", -1); - *ptask = task; - - if (lua_pcall (L, 1, 1, 0) != 0) { - msg_err_task ("stat_metatokens failed: %s", - lua_tostring (L, -1)); - lua_pop (L, 1); - } - else { - /* Iterate over table of tables */ - for (lua_pushnil (L); lua_next (L, -2); lua_pop (L, 1)) { - elt.flags |= RSPAMD_STAT_TOKEN_FLAG_LUA_META; - - if (lua_isnumber (L, -1)) { - gdouble num = lua_tonumber (L, -1); - guint8 *pnum = rspamd_mempool_alloc (task->task_pool, - sizeof (num)); - - msg_debug_task ("got metatoken number: %.2f", num); - memcpy (pnum, &num, sizeof (num)); - elt.begin = (gchar *) pnum; - elt.len = sizeof (num); - g_array_append_val (ar, elt); - } - else if (lua_isstring (L, -1)) { - const gchar *str; - gsize tlen; - - str = lua_tolstring (L, -1, &tlen); - guint8 *pstr = rspamd_mempool_alloc (task->task_pool, - tlen); - memcpy (pstr, str, tlen); - - msg_debug_task ("got metatoken string: %*s", - (gint)tlen, str); - elt.begin = (gchar *)pstr; - elt.len = tlen; - g_array_append_val (ar, elt); + lua_gettable (L, -2); + + if (lua_type (L, -1) == LUA_TTABLE) { + lua_pushstring (L, "callback"); + lua_gettable (L, -2); + + if (lua_type (L, -1) == LUA_TFUNCTION) { + struct rspamd_task **ptask; + + ptask = lua_newuserdata (L, sizeof (*ptask)); + rspamd_lua_setclass (L, "rspamd{task}", -1); + *ptask = task; + + if (lua_pcall (L, 1, 1, 0) != 0) { + msg_err_task ("stat_metatokens failed: %s", + lua_tostring (L, -1)); + lua_pop (L, 1); + } else { + /* Iterate over table of tables */ + for (lua_pushnil (L); lua_next (L, -2); lua_pop (L, 1)) { + elt.flags |= RSPAMD_STAT_TOKEN_FLAG_LUA_META; + + if (lua_isnumber (L, -1)) { + gdouble num = lua_tonumber (L, -1); + guint8 *pnum = rspamd_mempool_alloc ( + task->task_pool, + sizeof (num)); + + msg_debug_task ("got metatoken number: %.2f", num); + memcpy (pnum, &num, sizeof (num)); + elt.begin = (gchar *) pnum; + elt.len = sizeof (num); + g_array_append_val (ar, elt); + } else if (lua_isstring (L, -1)) { + const gchar *str; + gsize tlen; + + str = lua_tolstring (L, -1, &tlen); + guint8 *pstr = rspamd_mempool_alloc ( + task->task_pool, + tlen); + memcpy (pstr, str, tlen); + + msg_debug_task ("got metatoken string: %*s", + (gint) tlen, str); + elt.begin = (gchar *) pstr; + elt.len = tlen; + g_array_append_val (ar, elt); + } } } } diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c index 8d1742cc1..6be660b41 100644 --- a/src/libstat/tokenizers/osb.c +++ b/src/libstat/tokenizers/osb.c @@ -324,7 +324,6 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, #define ADD_TOKEN do {\ new_tok = rspamd_mempool_alloc0 (pool, token_size); \ - new_tok->datalen = sizeof (gint64); \ new_tok->flags = token_flags; \ new_tok->t1 = hashpipe[0].t; \ new_tok->t2 = hashpipe[i].t; \ @@ -333,12 +332,11 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, ((guint32)hashpipe[i].h) * primes[i << 1]; \ h2 = ((guint32)hashpipe[0].h) * primes[1] + \ ((guint32)hashpipe[i].h) * primes[(i << 1) - 1]; \ - memcpy(new_tok->data, &h1, sizeof (h1)); \ - memcpy(new_tok->data + sizeof (h1), &h2, sizeof (h2)); \ + memcpy((guchar *)&new_tok->data, &h1, sizeof (h1)); \ + memcpy(((guchar *)&new_tok->data) + sizeof (h1), &h2, sizeof (h2)); \ } \ else { \ - cur = hashpipe[0].h * primes[0] + hashpipe[i].h * primes[i << 1]; \ - memcpy (new_tok->data, &cur, sizeof (cur)); \ + new_tok->data = hashpipe[0].h * primes[0] + hashpipe[i].h * primes[i << 1]; \ } \ new_tok->window_idx = i + 1; \ g_ptr_array_add (result, new_tok); \ diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 3280152f9..e165a4341 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -57,18 +57,6 @@ const gchar t_delimiters[255] = { 0, 0, 0, 0, 0 }; -gint -token_node_compare_func (gconstpointer a, gconstpointer b) -{ - const rspamd_token_t *aa = a, *bb = b; - - if (aa->datalen != bb->datalen) { - return aa->datalen - bb->datalen; - } - - return memcmp (aa->data, bb->data, aa->datalen); -} - /* Get next word from specified f_str_t buf */ static gboolean rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf, -- 2.39.5