-rw-r--r--  lualib/redis_scripts/bayes_classify.lua |  6
-rw-r--r--  lualib/redis_scripts/bayes_learn.lua    |  4
-rw-r--r--  lualib/redis_scripts/bayes_stat.lua     |  0
-rw-r--r--  src/libstat/backends/redis_backend.cxx  | 20
4 files changed, 16 insertions(+), 14 deletions(-)
diff --git a/lualib/redis_scripts/bayes_classify.lua b/lualib/redis_scripts/bayes_classify.lua
index c999609e5..9bef96f14 100644
--- a/lualib/redis_scripts/bayes_classify.lua
+++ b/lualib/redis_scripts/bayes_classify.lua
@@ -1,10 +1,9 @@
 -- Lua script to perform bayes classification
 -- This script accepts the following parameters:
 -- key1 - prefix for bayes tokens (e.g. for per-user classification)
--- key2 - set of tokens encoded in messagepack array of int64_t
+-- key2 - set of tokens encoded in messagepack array of strings
 
 local prefix = KEYS[1]
-local input_tokens = cmsgpack.unpack(KEYS[2])
 local output_spam = {}
 local output_ham = {}
 
@@ -17,8 +16,9 @@ local prefix_underscore = prefix .. '_'
 -- This optimisation will save a lot of space for sparse tokens, and in Bayes that assumption is normally held
 
 if learned_ham > 0 and learned_spam > 0 then
+  local input_tokens = cmsgpack.unpack(KEYS[2])
   for i, token in ipairs(input_tokens) do
-    local token_data = redis.call('HMGET', prefix_underscore .. tostring(token), 'H', 'S')
+    local token_data = redis.call('HMGET', prefix_underscore .. token, 'H', 'S')
 
     if token_data then
       local ham_count = token_data[1]
diff --git a/lualib/redis_scripts/bayes_learn.lua b/lualib/redis_scripts/bayes_learn.lua
index 638254706..7536f6808 100644
--- a/lualib/redis_scripts/bayes_learn.lua
+++ b/lualib/redis_scripts/bayes_learn.lua
@@ -4,7 +4,7 @@
 -- key2 - boolean is_spam
 -- key3 - string symbol
 -- key4 - boolean is_unlearn
--- key5 - set of tokens encoded in messagepack array of int64_t
+-- key5 - set of tokens encoded in messagepack array of strings
 
 local prefix = KEYS[1]
 local is_spam = KEYS[2] == 'true' and true or false
@@ -21,5 +21,5 @@ redis.call('HSET', prefix, 'version', '2') -- new schema
 redis.call('HINCRBY', prefix, learned_key, is_unlearn and -1 or 1) -- increase or decrease learned count
 
 for _, token in ipairs(input_tokens) do
-  redis.call('HINCRBY', prefix_underscore .. tostring(token), hash_key, 1)
+  redis.call('HINCRBY', prefix_underscore .. token, hash_key, 1)
 end
\ No newline at end of file
diff --git a/lualib/redis_scripts/bayes_stat.lua b/lualib/redis_scripts/bayes_stat.lua
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/lualib/redis_scripts/bayes_stat.lua
diff --git a/src/libstat/backends/redis_backend.cxx b/src/libstat/backends/redis_backend.cxx
index 342fa0273..0eddf26cb 100644
--- a/src/libstat/backends/redis_backend.cxx
+++ b/src/libstat/backends/redis_backend.cxx
@@ -657,13 +657,13 @@ void rspamd_redis_close(gpointer p)
 static char *
 rspamd_redis_serialize_tokens(struct rspamd_task *task, GPtrArray *tokens, gsize *ser_len)
 {
-	/* Each token is int64_t that requires 9 bytes + 4 bytes array len + 1 byte array magic */
-	gsize req_len = tokens->len * 9 + 5, i;
-	gchar *buf, *p;
+	/* Each token is int64_t that requires 10 bytes (2 int32_t) + 4 bytes array len + 1 byte array magic */
+	char max_int64_str[] = "18446744073709551615";
+	auto req_len = tokens->len * sizeof(max_int64_str) + 5;
 	rspamd_token_t *tok;
 
-	buf = (gchar *) rspamd_mempool_alloc(task->task_pool, req_len);
-	p = buf;
+	auto *buf = (gchar *) rspamd_mempool_alloc(task->task_pool, req_len);
+	auto *p = buf;
 
 	/* Array */
 	*p++ = (gchar) 0xdd;
@@ -673,13 +673,15 @@ rspamd_redis_serialize_tokens(struct rspamd_task *task, GPtrArray *tokens, gsize
 	*p++ = (gchar) ((tokens->len >> 8) & 0xff);
 	*p++ = (gchar) (tokens->len & 0xff);
 
+	int i;
 	PTR_ARRAY_FOREACH(tokens, i, tok)
 	{
-		*p++ = (gchar) 0xd3;
+		char numbuf[sizeof(max_int64_str)];
+		auto r = rspamd_snprintf(numbuf, sizeof(numbuf), "%uL", tok->data);
+		*p++ = (gchar) ((r & 0xff) | 0xa0);
 
-		guint64 val = GUINT64_TO_BE(tok->data);
-		memcpy(p, &val, sizeof(val));
-		p += sizeof(val);
+		memcpy(p, &numbuf, r);
+		p += r;
 	}
 
 	*ser_len = p - buf;
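
Note on the new wire format: the serializer now emits each token as a MessagePack string instead of a signed 64-bit integer, i.e. an array32 header (0xdd plus a big-endian 32-bit element count) followed by one fixstr per token (0xa0 ORed with the length byte, then the decimal digits). A uint64 value needs at most 20 decimal digits, which always fits the 31-byte fixstr limit, and on the Lua side cmsgpack.unpack then yields plain strings that can be concatenated into the per-token hash keys without tostring(). The following is a minimal standalone sketch of that layout, not rspamd code; the function name, buffer sizes, and sample token values are made up for illustration.

/*
 * Standalone sketch: produce the same MessagePack layout as the updated
 * rspamd_redis_serialize_tokens(): one array32 header, then one fixstr per
 * token holding its decimal text. All names and values here are illustrative.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static size_t serialize_tokens(const uint64_t *tokens, uint32_t n, unsigned char *out)
{
	unsigned char *p = out;

	/* array32: magic byte 0xdd followed by a big-endian 32-bit element count */
	*p++ = 0xdd;
	*p++ = (n >> 24) & 0xff;
	*p++ = (n >> 16) & 0xff;
	*p++ = (n >> 8) & 0xff;
	*p++ = n & 0xff;

	for (uint32_t i = 0; i < n; i++) {
		char numbuf[sizeof("18446744073709551615")]; /* max uint64_t: 20 digits + NUL */
		int r = snprintf(numbuf, sizeof(numbuf), "%llu", (unsigned long long) tokens[i]);

		/* fixstr: 0xa0 | length; 20 digits always fits the 31-byte fixstr limit */
		*p++ = (unsigned char) (0xa0 | (r & 0x1f));
		memcpy(p, numbuf, (size_t) r);
		p += r;
	}

	return (size_t) (p - out);
}

int main(void)
{
	const uint64_t tokens[] = { 123456789ULL, 18446744073709551615ULL };
	unsigned char buf[5 + 2 * 21];
	size_t len = serialize_tokens(tokens, 2, buf);

	/* Hex dump so the header bytes and ASCII digits are visible;
	 * starts with: dd 00 00 00 02 a9 31 32 ... */
	for (size_t i = 0; i < len; i++) {
		printf("%02x ", (unsigned int) buf[i]);
	}
	printf("\n");

	return 0;
}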