[Rework] Store token data as a uint64_t instead of a char array

author Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 4 Apr 2017 15:49:44 +0000 (16:49 +0100)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 4 Apr 2017 15:49:44 +0000 (16:49 +0100)

diff --git a/src/libstat/backends/mmaped_file.c b/src/libstat/backends/mmaped_file.c

index 50a635432bd9c1b539114a58b9c2690ebd631ac9..65ccb5aa1a5fbb688a988734f9b97f66e500f018 100644 (file)
--- a/src/libstat/backends/mmaped_file.c
+++ b/src/libstat/backends/mmaped_file.c
@@ -963,8 +963,8 @@ rspamd_mmaped_file_process_tokens (struct rspamd_task *task, GPtrArray *tokens,
  
         for (i = 0; i < tokens->len; i++) {
                 tok = g_ptr_array_index (tokens, i);
-               memcpy (&h1, tok->data, sizeof (h1));
-               memcpy (&h2, tok->data + sizeof (h1), sizeof (h2));
+               memcpy (&h1, (guchar *)&tok->data, sizeof (h1));
+               memcpy (&h2, ((guchar *)&tok->data) + sizeof (h1), sizeof (h2));
                 tok->values[id] = rspamd_mmaped_file_get_block (mf, h1, h2);
         }
  
@@ -993,8 +993,8 @@ rspamd_mmaped_file_learn_tokens (struct rspamd_task *task, GPtrArray *tokens,
  
         for (i = 0; i < tokens->len; i++) {
                 tok = g_ptr_array_index (tokens, i);
-               memcpy (&h1, tok->data, sizeof (h1));
-               memcpy (&h2, tok->data + sizeof (h1), sizeof (h2));
+               memcpy (&h1, (guchar *)&tok->data, sizeof (h1));
+               memcpy (&h2, ((guchar *)&tok->data) + sizeof (h1), sizeof (h2));
                 rspamd_mmaped_file_set_block (task->task_pool, mf, h1, h2,
                                 tok->values[id]);
         }
diff --git a/src/libstat/backends/redis_backend.c b/src/libstat/backends/redis_backend.c

index 869ecad0ff880d4c49c99684da2f447c91c69cd6..5c66c00a76ec154af6d4448e5c20bfc5af6f5047 100644 (file)
--- a/src/libstat/backends/redis_backend.c
+++ b/src/libstat/backends/redis_backend.c
@@ -343,7 +343,6 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task, GPtrArray *tokens,
         rspamd_token_t *tok;
         gchar n0[64], n1[64];
         guint i, l0, l1, larg0, larg1;
-       guint64 num;
  
         g_assert (tokens != NULL);
  
@@ -365,7 +364,6 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task, GPtrArray *tokens,
  
         for (i = 0; i < tokens->len; i ++) {
                 tok = g_ptr_array_index (tokens, i);
-               memcpy (&num, tok->data, sizeof (num));
  
                 if (learn) {
                         rspamd_printf_fstring (&out, ""
@@ -377,7 +375,7 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task, GPtrArray *tokens,
                                         larg0, arg0,
                                         larg1, arg1);
  
-                       l0 = rspamd_snprintf (n0, sizeof (n0), "%uL", num);
+                       l0 = rspamd_snprintf (n0, sizeof (n0), "%uL", tok->data);
  
                         if (intvals) {
                                 l1 = rspamd_snprintf (n1, sizeof (n1), "%L",
@@ -395,7 +393,7 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task, GPtrArray *tokens,
                                         "%s\r\n", l0, n0, l1, n1);
                 }
                 else {
-                       l0 = rspamd_snprintf (n0, sizeof (n0), "%uL", num);
+                       l0 = rspamd_snprintf (n0, sizeof (n0), "%uL", tok->data);
                         rspamd_printf_fstring (&out, ""
                                         "$%d\r\n"
                                         "%s\r\n", l0, n0);
diff --git a/src/libstat/backends/sqlite3_backend.c b/src/libstat/backends/sqlite3_backend.c

index 2e49f80762ef53350e5123ad2df371e3221d19b0..ec47b06a922852c7c2aa2e105f12085bf3cd4194 100644 (file)
--- a/src/libstat/backends/sqlite3_backend.c
+++ b/src/libstat/backends/sqlite3_backend.c
@@ -672,7 +672,7 @@ rspamd_sqlite3_process_tokens (struct rspamd_task *task,
  {
         struct rspamd_stat_sqlite3_db *bk;
         struct rspamd_stat_sqlite3_rt *rt = p;
-       gint64 iv = 0, idx;
+       gint64 iv = 0;
         guint i;
         rspamd_token_t *tok;
  
@@ -714,11 +714,9 @@ rspamd_sqlite3_process_tokens (struct rspamd_task *task,
                         }
                 }
  
-               memcpy (&idx, tok->data, sizeof (idx));
-
                 if (rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt,
                                 RSPAMD_STAT_BACKEND_GET_TOKEN,
-                               idx, rt->user_id, rt->lang_id, &iv) == SQLITE_OK) {
+                               tok->data, rt->user_id, rt->lang_id, &iv) == SQLITE_OK) {
                         tok->values[id] = iv;
                 }
                 else {
@@ -765,7 +763,7 @@ rspamd_sqlite3_learn_tokens (struct rspamd_task *task, GPtrArray *tokens,
  {
         struct rspamd_stat_sqlite3_db *bk;
         struct rspamd_stat_sqlite3_rt *rt = p;
-       gint64 iv = 0, idx;
+       gint64 iv = 0;
         guint i;
         rspamd_token_t *tok;
  
@@ -806,11 +804,10 @@ rspamd_sqlite3_learn_tokens (struct rspamd_task *task, GPtrArray *tokens,
                 }
  
                 iv = tok->values[id];
-               memcpy (&idx, tok->data, sizeof (idx));
  
                 if (rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt,
                                 RSPAMD_STAT_BACKEND_SET_TOKEN,
-                               idx, rt->user_id, rt->lang_id, iv) != SQLITE_OK) {
+                               tok->data, rt->user_id, rt->lang_id, iv) != SQLITE_OK) {
                         rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt,
                                         RSPAMD_STAT_BACKEND_TRANSACTION_ROLLBACK);
                         bk->in_transaction = FALSE;
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c

index 95bd1f5ea854fc60e7ad930b62aaf8cb6d1649dd..c9faae6bd32280eb375cc40bcc437bc859e4e269 100644 (file)
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -159,11 +159,12 @@ bayes_classify_token (struct rspamd_classifier *ctx,
                 cl->processed_tokens ++;
  
                 if (tok->t1 && tok->t2) {
-                       msg_debug_bayes ("token <%*s:%*s>: weight: %f, total_count: %L, "
+                       msg_debug_bayes ("token %uL <%*s:%*s>: weight: %f, total_count: %L, "
                                         "spam_count: %L, ham_count: %L,"
                                         "spam_prob: %.3f, ham_prob: %.3f, "
                                         "bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, "
                                         "current spam prob: %.3f, current ham prob: %.3f",
+                                       tok->data,
                                         (int) tok->t1->len, tok->t1->begin,
                                         (int) tok->t2->len, tok->t2->begin,
                                         fw, total_count, spam_count, ham_count,
@@ -172,11 +173,12 @@ bayes_classify_token (struct rspamd_classifier *ctx,
                                         cl->spam_prob, cl->ham_prob);
                 }
                 else {
-                       msg_debug_bayes ("token <?:?>: weight: %f, total_count: %L, "
+                       msg_debug_bayes ("token %uL <?:?>: weight: %f, total_count: %L, "
                                         "spam_count: %L, ham_count: %L,"
                                         "spam_prob: %.3f, ham_prob: %.3f, "
                                         "bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, "
                                         "current spam prob: %.3f, current ham prob: %.3f",
+                                       tok->data,
                                         fw, total_count, spam_count, ham_count,
                                         spam_prob, ham_prob,
                                         bayes_spam_prob, bayes_ham_prob,
@@ -324,7 +326,7 @@ bayes_learn_spam (struct rspamd_classifier * ctx,
                 gboolean unlearn,
                 GError **err)
  {
-       guint i, j;
+       guint i, j, total_cnt, spam_cnt, ham_cnt;
         gint id;
         struct rspamd_statfile *st;
         rspamd_token_t *tok;
@@ -336,6 +338,9 @@ bayes_learn_spam (struct rspamd_classifier * ctx,
         incrementing = ctx->cfg->flags & RSPAMD_FLAG_CLASSIFIER_INCREMENTING_BACKEND;
  
         for (i = 0; i < tokens->len; i++) {
+               total_cnt = 0;
+               spam_cnt = 0;
+               ham_cnt = 0;
                 tok = g_ptr_array_index (tokens, i);
  
                 for (j = 0; j < ctx->statfiles_ids->len; j++) {
@@ -350,21 +355,55 @@ bayes_learn_spam (struct rspamd_classifier * ctx,
                                 else {
                                         tok->values[id]++;
                                 }
-                       }
-                       else if (tok->values[id] > 0 && unlearn) {
-                               /* Unlearning */
-                               if (incrementing) {
-                                       tok->values[id] = -1;
+
+                               total_cnt += tok->values[id];
+
+                               if (st->stcf->is_spam) {
+                                       spam_cnt += tok->values[id];
                                 }
                                 else {
-                                       tok->values[id]--;
+                                       ham_cnt += tok->values[id];
                                 }
                         }
-                       else if (incrementing) {
-                               tok->values[id] = 0;
+                       else {
+                               if (tok->values[id] > 0 && unlearn) {
+                                       /* Unlearning */
+                                       if (incrementing) {
+                                               tok->values[id] = -1;
+                                       }
+                                       else {
+                                               tok->values[id]--;
+                                       }
+
+                                       if (st->stcf->is_spam) {
+                                               spam_cnt += tok->values[id];
+                                       }
+                                       else {
+                                               ham_cnt += tok->values[id];
+                                       }
+                                       total_cnt += tok->values[id];
+                               }
+                               else if (incrementing) {
+                                       tok->values[id] = 0;
+                               }
                         }
                 }
+
+               if (tok->t1 && tok->t2) {
+                       msg_debug_bayes ("token %uL <%*s:%*s>: window: %d, total_count: %d, "
+                                       "spam_count: %d, ham_count: %d",
+                                       tok->data,
+                                       (int) tok->t1->len, tok->t1->begin,
+                                       (int) tok->t2->len, tok->t2->begin,
+                                       tok->window_idx, total_cnt, spam_cnt, ham_cnt);
+               }
+               else {
+                       msg_debug_bayes ("token %uL <?:?>: window: %d, total_count: %d, "
+                                       "spam_count: %d, ham_count: %d",
+                                       tok->data,
+                                       tok->window_idx, total_cnt, spam_cnt, ham_cnt);
+               }
         }
  
         return TRUE;
-}
+}
+\ No newline at end of file
diff --git a/src/libstat/classifiers/lua_classifier.c b/src/libstat/classifiers/lua_classifier.c

index a28b58d84f0af75ed5cfb2a1d415339dc7d59e85..823f689d5a6d4415c3d5afa8e43464bc0cbfd8cf 100644 (file)
--- a/src/libstat/classifiers/lua_classifier.c
+++ b/src/libstat/classifiers/lua_classifier.c
@@ -151,8 +151,7 @@ lua_classifier_classify (struct rspamd_classifier *cl,
  
         for (i = 0; i < tokens->len; i ++) {
                 tok = g_ptr_array_index (tokens, i);
-               v = 0;
-               memcpy (&v, tok->data, MIN (sizeof (v), tok->datalen));
+               v = tok->data;
                 lua_createtable (L, 3, 0);
                 /* High word, low word, order */
                 lua_pushnumber (L, (guint32)(v >> 32));
@@ -208,7 +207,7 @@ lua_classifier_learn_spam (struct rspamd_classifier *cl,
         for (i = 0; i < tokens->len; i ++) {
                 tok = g_ptr_array_index (tokens, i);
                 v = 0;
-               memcpy (&v, tok->data, MIN (sizeof (v), tok->datalen));
+               v = tok->data;
                 lua_createtable (L, 3, 0);
                 /* High word, low word, order */
                 lua_pushnumber (L, (guint32)(v >> 32));
diff --git a/src/libstat/learn_cache/redis_cache.c b/src/libstat/learn_cache/redis_cache.c

index cc6e66469e59f8128687a98151a3663814a8483e..14a90b9d1b315fa13bfb149fa82b8ea380a318a7 100644 (file)
--- a/src/libstat/learn_cache/redis_cache.c
+++ b/src/libstat/learn_cache/redis_cache.c
@@ -197,7 +197,8 @@ rspamd_stat_cache_redis_generate_id (struct rspamd_task *task)
  
         for (i = 0; i < task->tokens->len; i ++) {
                 tok = g_ptr_array_index (task->tokens, i);
-               rspamd_cryptobox_hash_update (&st, tok->data, tok->datalen);
+               rspamd_cryptobox_hash_update (&st, (guchar *)&tok->data,
+                               sizeof (tok->data));
         }
  
         rspamd_cryptobox_hash_final (&st, out);
diff --git a/src/libstat/learn_cache/sqlite3_cache.c b/src/libstat/learn_cache/sqlite3_cache.c

index 48cfe4af78a2f8f403975e95218e9a215926458e..055950cf43092541d281025259a123fac8c14931 100644 (file)
--- a/src/libstat/learn_cache/sqlite3_cache.c
+++ b/src/libstat/learn_cache/sqlite3_cache.c
@@ -200,7 +200,8 @@ rspamd_stat_cache_sqlite3_check (struct rspamd_task *task,
  
                 for (i = 0; i < task->tokens->len; i ++) {
                         tok = g_ptr_array_index (task->tokens, i);
-                       rspamd_cryptobox_hash_update (&st, tok->data, tok->datalen);
+                       rspamd_cryptobox_hash_update (&st, (guchar *)&tok->data,
+                                       sizeof (tok->data));
                 }
  
                 rspamd_cryptobox_hash_final (&st, out);
diff --git a/src/libstat/stat_internal.h b/src/libstat/stat_internal.h

index 3290d5bc247a555fd5a5416897ce52e8ee97ee11..cab185f7c9fcef6171a6796cd9b833777acc20f5 100644 (file)
--- a/src/libstat/stat_internal.h
+++ b/src/libstat/stat_internal.h
@@ -51,11 +51,9 @@ struct rspamd_statfile {
         gpointer bkcf;
  };
  
-#define RSPAMD_MAX_TOKEN_LEN 8
  typedef struct token_node_s {
-       guchar data[RSPAMD_MAX_TOKEN_LEN];
+       guint64 data;
         guint window_idx;
-       guint datalen;
         guint flags;
         rspamd_stat_token_t *t1;
         rspamd_stat_token_t *t2;
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c

index 6e4287abe014de956da47444d0ec39a0b45ebd65..356b538070bee690535f3e43bd8d47745d7178c6 100644 (file)
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -176,50 +176,55 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx,
  
         if (lua_type (L, -1) == LUA_TTABLE) {
                 lua_pushstring (L, "stat_metatokens");
-               lua_gettable (L, -1);
-
-               if (lua_type (L, -1) == LUA_TFUNCTION) {
-                       struct rspamd_task **ptask;
-
-                       ptask = lua_newuserdata (L, sizeof (*ptask));
-                       rspamd_lua_setclass (L, "rspamd{task}", -1);
-                       *ptask = task;
-
-                       if (lua_pcall (L, 1, 1, 0) != 0) {
-                               msg_err_task ("stat_metatokens failed: %s",
-                                               lua_tostring (L, -1));
-                               lua_pop (L, 1);
-                       }
-                       else {
-                               /* Iterate over table of tables */
-                               for (lua_pushnil (L); lua_next (L, -2); lua_pop (L, 1)) {
-                                       elt.flags |= RSPAMD_STAT_TOKEN_FLAG_LUA_META;
-
-                                       if (lua_isnumber (L, -1)) {
-                                               gdouble num = lua_tonumber (L, -1);
-                                               guint8 *pnum = rspamd_mempool_alloc (task->task_pool,
-                                                               sizeof (num));
-
-                                               msg_debug_task ("got metatoken number: %.2f", num);
-                                               memcpy (pnum, &num, sizeof (num));
-                                               elt.begin = (gchar *) pnum;
-                                               elt.len = sizeof (num);
-                                               g_array_append_val (ar, elt);
-                                       }
-                                       else if (lua_isstring (L, -1)) {
-                                               const gchar *str;
-                                               gsize tlen;
-
-                                               str = lua_tolstring (L, -1, &tlen);
-                                               guint8 *pstr = rspamd_mempool_alloc (task->task_pool,
-                                                               tlen);
-                                               memcpy (pstr, str, tlen);
-
-                                               msg_debug_task ("got metatoken string: %*s",
-                                                               (gint)tlen, str);
-                                               elt.begin = (gchar *)pstr;
-                                               elt.len = tlen;
-                                               g_array_append_val (ar, elt);
+               lua_gettable (L, -2);
+
+               if (lua_type (L, -1) == LUA_TTABLE) {
+                       lua_pushstring (L, "callback");
+                       lua_gettable (L, -2);
+
+                       if (lua_type (L, -1) == LUA_TFUNCTION) {
+                               struct rspamd_task **ptask;
+
+                               ptask = lua_newuserdata (L, sizeof (*ptask));
+                               rspamd_lua_setclass (L, "rspamd{task}", -1);
+                               *ptask = task;
+
+                               if (lua_pcall (L, 1, 1, 0) != 0) {
+                                       msg_err_task ("stat_metatokens failed: %s",
+                                                       lua_tostring (L, -1));
+                                       lua_pop (L, 1);
+                               } else {
+                                       /* Iterate over table of tables */
+                                       for (lua_pushnil (L); lua_next (L, -2); lua_pop (L, 1)) {
+                                               elt.flags |= RSPAMD_STAT_TOKEN_FLAG_LUA_META;
+
+                                               if (lua_isnumber (L, -1)) {
+                                                       gdouble num = lua_tonumber (L, -1);
+                                                       guint8 *pnum = rspamd_mempool_alloc (
+                                                                       task->task_pool,
+                                                                       sizeof (num));
+
+                                                       msg_debug_task ("got metatoken number: %.2f", num);
+                                                       memcpy (pnum, &num, sizeof (num));
+                                                       elt.begin = (gchar *) pnum;
+                                                       elt.len = sizeof (num);
+                                                       g_array_append_val (ar, elt);
+                                               } else if (lua_isstring (L, -1)) {
+                                                       const gchar *str;
+                                                       gsize tlen;
+
+                                                       str = lua_tolstring (L, -1, &tlen);
+                                                       guint8 *pstr = rspamd_mempool_alloc (
+                                                                       task->task_pool,
+                                                                       tlen);
+                                                       memcpy (pstr, str, tlen);
+
+                                                       msg_debug_task ("got metatoken string: %*s",
+                                                                       (gint) tlen, str);
+                                                       elt.begin = (gchar *) pstr;
+                                                       elt.len = tlen;
+                                                       g_array_append_val (ar, elt);
+                                               }
                                         }
                                 }
                         }
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c

index 8d1742cc189d61477ca75fd3db2f1faa1ca58ac3..6be660b41537f6c7973f1bfa39bf5364c5783447 100644 (file)
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -324,7 +324,6 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
  
  #define ADD_TOKEN do {\
      new_tok = rspamd_mempool_alloc0 (pool, token_size); \
-    new_tok->datalen = sizeof (gint64); \
      new_tok->flags = token_flags; \
      new_tok->t1 = hashpipe[0].t; \
      new_tok->t2 = hashpipe[i].t; \
@@ -333,12 +332,11 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
              ((guint32)hashpipe[i].h) * primes[i << 1]; \
          h2 = ((guint32)hashpipe[0].h) * primes[1] + \
              ((guint32)hashpipe[i].h) * primes[(i << 1) - 1]; \
-        memcpy(new_tok->data, &h1, sizeof (h1)); \
-        memcpy(new_tok->data + sizeof (h1), &h2, sizeof (h2)); \
+        memcpy((guchar *)&new_tok->data, &h1, sizeof (h1)); \
+        memcpy(((guchar *)&new_tok->data) + sizeof (h1), &h2, sizeof (h2)); \
      } \
      else { \
-        cur = hashpipe[0].h * primes[0] + hashpipe[i].h * primes[i << 1]; \
-        memcpy (new_tok->data, &cur, sizeof (cur)); \
+        new_tok->data = hashpipe[0].h * primes[0] + hashpipe[i].h * primes[i << 1]; \
      } \
      new_tok->window_idx = i + 1; \
      g_ptr_array_add (result, new_tok); \
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c

index 3280152f97a2a42fa51d4f2b66a761ce348662f8..e165a4341487f6e2225112e55e1d97e7a54cd633 100644 (file)
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -57,18 +57,6 @@ const gchar t_delimiters[255] = {
         0, 0, 0, 0, 0
  };
  
-gint
-token_node_compare_func (gconstpointer a, gconstpointer b)
-{
-       const rspamd_token_t *aa = a, *bb = b;
-
-       if (aa->datalen != bb->datalen) {
-               return aa->datalen - bb->datalen;
-       }
-
-       return memcmp (aa->data, bb->data, aa->datalen);
-}
-
  /* Get next word from specified f_str_t buf */
  static gboolean
  rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Tue, 4 Apr 2017 15:49:44 +0000 (16:49 +0100)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Tue, 4 Apr 2017 15:49:44 +0000 (16:49 +0100)
src/libstat/backends/mmaped_file.c		patch | blob | history
src/libstat/backends/redis_backend.c		patch | blob | history
src/libstat/backends/sqlite3_backend.c		patch | blob | history
src/libstat/classifiers/bayes.c		patch | blob | history
src/libstat/classifiers/lua_classifier.c		patch | blob | history
src/libstat/learn_cache/redis_cache.c		patch | blob | history
src/libstat/learn_cache/sqlite3_cache.c		patch | blob | history
src/libstat/stat_internal.h		patch | blob | history
src/libstat/stat_process.c		patch | blob | history
src/libstat/tokenizers/osb.c		patch | blob | history
src/libstat/tokenizers/tokenizers.c		patch | blob | history