about | summary | refs | log | tree | commit | diff | stats
path: root/src/libstat
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2017-04-04 16:49:44 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2017-04-04 16:49:44 +0100
commit e737e9942cc0c0cbd18dcbc9a1feb0a4b1c48a11 (patch)
tree ba2ee3b7d49e603476dbe15e52d4a8c93c30474b /src/libstat
parent 90f44e8ee59515936df340d5bace8ce68f515870 (diff)
download rspamd-e737e9942cc0c0cbd18dcbc9a1feb0a4b1c48a11.tar.gz
rspamd-e737e9942cc0c0cbd18dcbc9a1feb0a4b1c48a11.zip
[Rework] Set token data as uint64_t instead of chars array
Diffstat (limited to 'src/libstat')
-rw-r--r-- src/libstat/backends/mmaped_file.c | 8
-rw-r--r-- src/libstat/backends/redis_backend.c | 6
-rw-r--r-- src/libstat/backends/sqlite3_backend.c | 11
-rw-r--r-- src/libstat/classifiers/bayes.c | 63
-rw-r--r-- src/libstat/classifiers/lua_classifier.c | 5
-rw-r--r-- src/libstat/learn_cache/redis_cache.c | 3
-rw-r--r-- src/libstat/learn_cache/sqlite3_cache.c | 3
-rw-r--r-- src/libstat/stat_internal.h | 4
-rw-r--r-- src/libstat/stat_process.c | 93
-rw-r--r-- src/libstat/tokenizers/osb.c | 8
-rw-r--r-- src/libstat/tokenizers/tokenizers.c | 12
11 files changed, 120 insertions, 96 deletions
diff --git a/src/libstat/backends/mmaped_file.c b/src/libstat/backends/mmaped_file.c
index 50a635432..65ccb5aa1 100644
--- a/src/libstat/backends/mmaped_file.c
+++ b/src/libstat/backends/mmaped_file.c
@@ -963,8 +963,8 @@ rspamd_mmaped_file_process_tokens (struct rspamd_task *task, GPtrArray *tokens,
for (i = 0; i < tokens->len; i++) {
tok = g_ptr_array_index (tokens, i);
- memcpy (&h1, tok->data, sizeof (h1));
- memcpy (&h2, tok->data + sizeof (h1), sizeof (h2));
+ memcpy (&h1, (guchar *)&tok->data, sizeof (h1));
+ memcpy (&h2, ((guchar *)&tok->data) + sizeof (h1), sizeof (h2));
tok->values[id] = rspamd_mmaped_file_get_block (mf, h1, h2);
}
@@ -993,8 +993,8 @@ rspamd_mmaped_file_learn_tokens (struct rspamd_task *task, GPtrArray *tokens,
for (i = 0; i < tokens->len; i++) {
tok = g_ptr_array_index (tokens, i);
- memcpy (&h1, tok->data, sizeof (h1));
- memcpy (&h2, tok->data + sizeof (h1), sizeof (h2));
+ memcpy (&h1, (guchar *)&tok->data, sizeof (h1));
+ memcpy (&h2, ((guchar *)&tok->data) + sizeof (h1), sizeof (h2));
rspamd_mmaped_file_set_block (task->task_pool, mf, h1, h2,
tok->values[id]);
}
diff --git a/src/libstat/backends/redis_backend.c b/src/libstat/backends/redis_backend.c
index 869ecad0f..5c66c00a7 100644
--- a/src/libstat/backends/redis_backend.c
+++ b/src/libstat/backends/redis_backend.c
@@ -343,7 +343,6 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task, GPtrArray *tokens,
rspamd_token_t *tok;
gchar n0[64], n1[64];
guint i, l0, l1, larg0, larg1;
- guint64 num;
g_assert (tokens != NULL);
@@ -365,7 +364,6 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task, GPtrArray *tokens,
for (i = 0; i < tokens->len; i ++) {
tok = g_ptr_array_index (tokens, i);
- memcpy (&num, tok->data, sizeof (num));
if (learn) {
rspamd_printf_fstring (&out, ""
@@ -377,7 +375,7 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task, GPtrArray *tokens,
larg0, arg0,
larg1, arg1);
- l0 = rspamd_snprintf (n0, sizeof (n0), "%uL", num);
+ l0 = rspamd_snprintf (n0, sizeof (n0), "%uL", tok->data);
if (intvals) {
l1 = rspamd_snprintf (n1, sizeof (n1), "%L",
@@ -395,7 +393,7 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task, GPtrArray *tokens,
"%s\r\n", l0, n0, l1, n1);
}
else {
- l0 = rspamd_snprintf (n0, sizeof (n0), "%uL", num);
+ l0 = rspamd_snprintf (n0, sizeof (n0), "%uL", tok->data);
rspamd_printf_fstring (&out, ""
"$%d\r\n"
"%s\r\n", l0, n0);
diff --git a/src/libstat/backends/sqlite3_backend.c b/src/libstat/backends/sqlite3_backend.c
index 2e49f8076..ec47b06a9 100644
--- a/src/libstat/backends/sqlite3_backend.c
+++ b/src/libstat/backends/sqlite3_backend.c
@@ -672,7 +672,7 @@ rspamd_sqlite3_process_tokens (struct rspamd_task *task,
{
struct rspamd_stat_sqlite3_db *bk;
struct rspamd_stat_sqlite3_rt *rt = p;
- gint64 iv = 0, idx;
+ gint64 iv = 0;
guint i;
rspamd_token_t *tok;
@@ -714,11 +714,9 @@ rspamd_sqlite3_process_tokens (struct rspamd_task *task,
}
}
- memcpy (&idx, tok->data, sizeof (idx));
-
if (rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt,
RSPAMD_STAT_BACKEND_GET_TOKEN,
- idx, rt->user_id, rt->lang_id, &iv) == SQLITE_OK) {
+ tok->data, rt->user_id, rt->lang_id, &iv) == SQLITE_OK) {
tok->values[id] = iv;
}
else {
@@ -765,7 +763,7 @@ rspamd_sqlite3_learn_tokens (struct rspamd_task *task, GPtrArray *tokens,
{
struct rspamd_stat_sqlite3_db *bk;
struct rspamd_stat_sqlite3_rt *rt = p;
- gint64 iv = 0, idx;
+ gint64 iv = 0;
guint i;
rspamd_token_t *tok;
@@ -806,11 +804,10 @@ rspamd_sqlite3_learn_tokens (struct rspamd_task *task, GPtrArray *tokens,
}
iv = tok->values[id];
- memcpy (&idx, tok->data, sizeof (idx));
if (rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt,
RSPAMD_STAT_BACKEND_SET_TOKEN,
- idx, rt->user_id, rt->lang_id, iv) != SQLITE_OK) {
+ tok->data, rt->user_id, rt->lang_id, iv) != SQLITE_OK) {
rspamd_sqlite3_run_prstmt (task->task_pool, bk->sqlite, bk->prstmt,
RSPAMD_STAT_BACKEND_TRANSACTION_ROLLBACK);
bk->in_transaction = FALSE;
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c
index 95bd1f5ea..c9faae6bd 100644
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -159,11 +159,12 @@ bayes_classify_token (struct rspamd_classifier *ctx,
cl->processed_tokens ++;
if (tok->t1 && tok->t2) {
- msg_debug_bayes ("token <%*s:%*s>: weight: %f, total_count: %L, "
+ msg_debug_bayes ("token %uL <%*s:%*s>: weight: %f, total_count: %L, "
"spam_count: %L, ham_count: %L,"
"spam_prob: %.3f, ham_prob: %.3f, "
"bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, "
"current spam prob: %.3f, current ham prob: %.3f",
+ tok->data,
(int) tok->t1->len, tok->t1->begin,
(int) tok->t2->len, tok->t2->begin,
fw, total_count, spam_count, ham_count,
@@ -172,11 +173,12 @@ bayes_classify_token (struct rspamd_classifier *ctx,
cl->spam_prob, cl->ham_prob);
}
else {
- msg_debug_bayes ("token <?:?>: weight: %f, total_count: %L, "
+ msg_debug_bayes ("token %uL <?:?>: weight: %f, total_count: %L, "
"spam_count: %L, ham_count: %L,"
"spam_prob: %.3f, ham_prob: %.3f, "
"bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, "
"current spam prob: %.3f, current ham prob: %.3f",
+ tok->data,
fw, total_count, spam_count, ham_count,
spam_prob, ham_prob,
bayes_spam_prob, bayes_ham_prob,
@@ -324,7 +326,7 @@ bayes_learn_spam (struct rspamd_classifier * ctx,
gboolean unlearn,
GError **err)
{
- guint i, j;
+ guint i, j, total_cnt, spam_cnt, ham_cnt;
gint id;
struct rspamd_statfile *st;
rspamd_token_t *tok;
@@ -336,6 +338,9 @@ bayes_learn_spam (struct rspamd_classifier * ctx,
incrementing = ctx->cfg->flags & RSPAMD_FLAG_CLASSIFIER_INCREMENTING_BACKEND;
for (i = 0; i < tokens->len; i++) {
+ total_cnt = 0;
+ spam_cnt = 0;
+ ham_cnt = 0;
tok = g_ptr_array_index (tokens, i);
for (j = 0; j < ctx->statfiles_ids->len; j++) {
@@ -350,21 +355,55 @@ bayes_learn_spam (struct rspamd_classifier * ctx,
else {
tok->values[id]++;
}
- }
- else if (tok->values[id] > 0 && unlearn) {
- /* Unlearning */
- if (incrementing) {
- tok->values[id] = -1;
+
+ total_cnt += tok->values[id];
+
+ if (st->stcf->is_spam) {
+ spam_cnt += tok->values[id];
}
else {
- tok->values[id]--;
+ ham_cnt += tok->values[id];
}
}
- else if (incrementing) {
- tok->values[id] = 0;
+ else {
+ if (tok->values[id] > 0 && unlearn) {
+ /* Unlearning */
+ if (incrementing) {
+ tok->values[id] = -1;
+ }
+ else {
+ tok->values[id]--;
+ }
+
+ if (st->stcf->is_spam) {
+ spam_cnt += tok->values[id];
+ }
+ else {
+ ham_cnt += tok->values[id];
+ }
+ total_cnt += tok->values[id];
+ }
+ else if (incrementing) {
+ tok->values[id] = 0;
+ }
}
}
+
+ if (tok->t1 && tok->t2) {
+ msg_debug_bayes ("token %uL <%*s:%*s>: window: %d, total_count: %d, "
+ "spam_count: %d, ham_count: %d",
+ tok->data,
+ (int) tok->t1->len, tok->t1->begin,
+ (int) tok->t2->len, tok->t2->begin,
+ tok->window_idx, total_cnt, spam_cnt, ham_cnt);
+ }
+ else {
+ msg_debug_bayes ("token %uL <?:?>: window: %d, total_count: %d, "
+ "spam_count: %d, ham_count: %d",
+ tok->data,
+ tok->window_idx, total_cnt, spam_cnt, ham_cnt);
+ }
}
return TRUE;
-}
+} \ No newline at end of file
diff --git a/src/libstat/classifiers/lua_classifier.c b/src/libstat/classifiers/lua_classifier.c
index a28b58d84..823f689d5 100644
--- a/src/libstat/classifiers/lua_classifier.c
+++ b/src/libstat/classifiers/lua_classifier.c
@@ -151,8 +151,7 @@ lua_classifier_classify (struct rspamd_classifier *cl,
for (i = 0; i < tokens->len; i ++) {
tok = g_ptr_array_index (tokens, i);
- v = 0;
- memcpy (&v, tok->data, MIN (sizeof (v), tok->datalen));
+ v = tok->data;
lua_createtable (L, 3, 0);
/* High word, low word, order */
lua_pushnumber (L, (guint32)(v >> 32));
@@ -208,7 +207,7 @@ lua_classifier_learn_spam (struct rspamd_classifier *cl,
for (i = 0; i < tokens->len; i ++) {
tok = g_ptr_array_index (tokens, i);
v = 0;
- memcpy (&v, tok->data, MIN (sizeof (v), tok->datalen));
+ v = tok->data;
lua_createtable (L, 3, 0);
/* High word, low word, order */
lua_pushnumber (L, (guint32)(v >> 32));
diff --git a/src/libstat/learn_cache/redis_cache.c b/src/libstat/learn_cache/redis_cache.c
index cc6e66469..14a90b9d1 100644
--- a/src/libstat/learn_cache/redis_cache.c
+++ b/src/libstat/learn_cache/redis_cache.c
@@ -197,7 +197,8 @@ rspamd_stat_cache_redis_generate_id (struct rspamd_task *task)
for (i = 0; i < task->tokens->len; i ++) {
tok = g_ptr_array_index (task->tokens, i);
- rspamd_cryptobox_hash_update (&st, tok->data, tok->datalen);
+ rspamd_cryptobox_hash_update (&st, (guchar *)&tok->data,
+ sizeof (tok->data));
}
rspamd_cryptobox_hash_final (&st, out);
diff --git a/src/libstat/learn_cache/sqlite3_cache.c b/src/libstat/learn_cache/sqlite3_cache.c
index 48cfe4af7..055950cf4 100644
--- a/src/libstat/learn_cache/sqlite3_cache.c
+++ b/src/libstat/learn_cache/sqlite3_cache.c
@@ -200,7 +200,8 @@ rspamd_stat_cache_sqlite3_check (struct rspamd_task *task,
for (i = 0; i < task->tokens->len; i ++) {
tok = g_ptr_array_index (task->tokens, i);
- rspamd_cryptobox_hash_update (&st, tok->data, tok->datalen);
+ rspamd_cryptobox_hash_update (&st, (guchar *)&tok->data,
+ sizeof (tok->data));
}
rspamd_cryptobox_hash_final (&st, out);
diff --git a/src/libstat/stat_internal.h b/src/libstat/stat_internal.h
index 3290d5bc2..cab185f7c 100644
--- a/src/libstat/stat_internal.h
+++ b/src/libstat/stat_internal.h
@@ -51,11 +51,9 @@ struct rspamd_statfile {
gpointer bkcf;
};
-#define RSPAMD_MAX_TOKEN_LEN 8
typedef struct token_node_s {
- guchar data[RSPAMD_MAX_TOKEN_LEN];
+ guint64 data;
guint window_idx;
- guint datalen;
guint flags;
rspamd_stat_token_t *t1;
rspamd_stat_token_t *t2;
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index 6e4287abe..356b53807 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -176,50 +176,55 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx,
if (lua_type (L, -1) == LUA_TTABLE) {
lua_pushstring (L, "stat_metatokens");
- lua_gettable (L, -1);
-
- if (lua_type (L, -1) == LUA_TFUNCTION) {
- struct rspamd_task **ptask;
-
- ptask = lua_newuserdata (L, sizeof (*ptask));
- rspamd_lua_setclass (L, "rspamd{task}", -1);
- *ptask = task;
-
- if (lua_pcall (L, 1, 1, 0) != 0) {
- msg_err_task ("stat_metatokens failed: %s",
- lua_tostring (L, -1));
- lua_pop (L, 1);
- }
- else {
- /* Iterate over table of tables */
- for (lua_pushnil (L); lua_next (L, -2); lua_pop (L, 1)) {
- elt.flags |= RSPAMD_STAT_TOKEN_FLAG_LUA_META;
-
- if (lua_isnumber (L, -1)) {
- gdouble num = lua_tonumber (L, -1);
- guint8 *pnum = rspamd_mempool_alloc (task->task_pool,
- sizeof (num));
-
- msg_debug_task ("got metatoken number: %.2f", num);
- memcpy (pnum, &num, sizeof (num));
- elt.begin = (gchar *) pnum;
- elt.len = sizeof (num);
- g_array_append_val (ar, elt);
- }
- else if (lua_isstring (L, -1)) {
- const gchar *str;
- gsize tlen;
-
- str = lua_tolstring (L, -1, &tlen);
- guint8 *pstr = rspamd_mempool_alloc (task->task_pool,
- tlen);
- memcpy (pstr, str, tlen);
-
- msg_debug_task ("got metatoken string: %*s",
- (gint)tlen, str);
- elt.begin = (gchar *)pstr;
- elt.len = tlen;
- g_array_append_val (ar, elt);
+ lua_gettable (L, -2);
+
+ if (lua_type (L, -1) == LUA_TTABLE) {
+ lua_pushstring (L, "callback");
+ lua_gettable (L, -2);
+
+ if (lua_type (L, -1) == LUA_TFUNCTION) {
+ struct rspamd_task **ptask;
+
+ ptask = lua_newuserdata (L, sizeof (*ptask));
+ rspamd_lua_setclass (L, "rspamd{task}", -1);
+ *ptask = task;
+
+ if (lua_pcall (L, 1, 1, 0) != 0) {
+ msg_err_task ("stat_metatokens failed: %s",
+ lua_tostring (L, -1));
+ lua_pop (L, 1);
+ } else {
+ /* Iterate over table of tables */
+ for (lua_pushnil (L); lua_next (L, -2); lua_pop (L, 1)) {
+ elt.flags |= RSPAMD_STAT_TOKEN_FLAG_LUA_META;
+
+ if (lua_isnumber (L, -1)) {
+ gdouble num = lua_tonumber (L, -1);
+ guint8 *pnum = rspamd_mempool_alloc (
+ task->task_pool,
+ sizeof (num));
+
+ msg_debug_task ("got metatoken number: %.2f", num);
+ memcpy (pnum, &num, sizeof (num));
+ elt.begin = (gchar *) pnum;
+ elt.len = sizeof (num);
+ g_array_append_val (ar, elt);
+ } else if (lua_isstring (L, -1)) {
+ const gchar *str;
+ gsize tlen;
+
+ str = lua_tolstring (L, -1, &tlen);
+ guint8 *pstr = rspamd_mempool_alloc (
+ task->task_pool,
+ tlen);
+ memcpy (pstr, str, tlen);
+
+ msg_debug_task ("got metatoken string: %*s",
+ (gint) tlen, str);
+ elt.begin = (gchar *) pstr;
+ elt.len = tlen;
+ g_array_append_val (ar, elt);
+ }
}
}
}
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
index 8d1742cc1..6be660b41 100644
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -324,7 +324,6 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
#define ADD_TOKEN do {\
new_tok = rspamd_mempool_alloc0 (pool, token_size); \
- new_tok->datalen = sizeof (gint64); \
new_tok->flags = token_flags; \
new_tok->t1 = hashpipe[0].t; \
new_tok->t2 = hashpipe[i].t; \
@@ -333,12 +332,11 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
((guint32)hashpipe[i].h) * primes[i << 1]; \
h2 = ((guint32)hashpipe[0].h) * primes[1] + \
((guint32)hashpipe[i].h) * primes[(i << 1) - 1]; \
- memcpy(new_tok->data, &h1, sizeof (h1)); \
- memcpy(new_tok->data + sizeof (h1), &h2, sizeof (h2)); \
+ memcpy((guchar *)&new_tok->data, &h1, sizeof (h1)); \
+ memcpy(((guchar *)&new_tok->data) + sizeof (h1), &h2, sizeof (h2)); \
} \
else { \
- cur = hashpipe[0].h * primes[0] + hashpipe[i].h * primes[i << 1]; \
- memcpy (new_tok->data, &cur, sizeof (cur)); \
+ new_tok->data = hashpipe[0].h * primes[0] + hashpipe[i].h * primes[i << 1]; \
} \
new_tok->window_idx = i + 1; \
g_ptr_array_add (result, new_tok); \
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 3280152f9..e165a4341 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -57,18 +57,6 @@ const gchar t_delimiters[255] = {
0, 0, 0, 0, 0
};
-gint
-token_node_compare_func (gconstpointer a, gconstpointer b)
-{
- const rspamd_token_t *aa = a, *bb = b;
-
- if (aa->datalen != bb->datalen) {
- return aa->datalen - bb->datalen;
- }
-
- return memcmp (aa->data, bb->data, aa->datalen);
-}
-
/* Get next word from specified f_str_t buf */
static gboolean
rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,