diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-25 17:34:08 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-25 17:34:08 +0000 |
commit | e180ef2ce601b4118dab29ab074712c0a58244e4 (patch) | |
tree | b3da007cff3ce5964f86e868301fc9cb897ffb70 /src/libstat | |
parent | 63ef123b048d5f1f2f6a5d172be6dc1a2629e2d7 (diff) | |
download | rspamd-e180ef2ce601b4118dab29ab074712c0a58244e4.tar.gz rspamd-e180ef2ce601b4118dab29ab074712c0a58244e4.zip |
[Project] Finish basic tasks in new unicode project
Diffstat (limited to 'src/libstat')
-rw-r--r-- | src/libstat/backends/redis_backend.c | 12 | ||||
-rw-r--r-- | src/libstat/classifiers/bayes.c | 17 | ||||
-rw-r--r-- | src/libstat/stat_process.c | 10 | ||||
-rw-r--r-- | src/libstat/tokenizers/osb.c | 30 |
4 files changed, 43 insertions, 26 deletions
diff --git a/src/libstat/backends/redis_backend.c b/src/libstat/backends/redis_backend.c index 00441a7a6..b003d5a27 100644 --- a/src/libstat/backends/redis_backend.c +++ b/src/libstat/backends/redis_backend.c @@ -527,14 +527,14 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task, "HSET %b_tokens %b %b:%b", prefix, (size_t) prefix_len, n0, (size_t) l0, - tok->t1->begin, tok->t1->len, - tok->t2->begin, tok->t2->len); + tok->t1->stemmed.begin, tok->t1->stemmed.len, + tok->t2->stemmed.begin, tok->t2->stemmed.len); } else if (tok->t1) { redisAsyncCommand (rt->redis, NULL, NULL, "HSET %b_tokens %b %b", prefix, (size_t) prefix_len, n0, (size_t) l0, - tok->t1->begin, tok->t1->len); + tok->t1->stemmed.begin, tok->t1->stemmed.len); } } else { @@ -548,14 +548,14 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task, "HSET %b %s %b:%b", n0, (size_t) l0, "tokens", - tok->t1->begin, tok->t1->len, - tok->t2->begin, tok->t2->len); + tok->t1->stemmed.begin, tok->t1->stemmed.len, + tok->t2->stemmed.begin, tok->t2->stemmed.len); } else if (tok->t1) { redisAsyncCommand (rt->redis, NULL, NULL, "HSET %b %s %b", n0, (size_t) l0, "tokens", - tok->t1->begin, tok->t1->len); + tok->t1->stemmed.begin, tok->t1->stemmed.len); } } diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c index 934c8d941..2b0cf21e8 100644 --- a/src/libstat/classifiers/bayes.c +++ b/src/libstat/classifiers/bayes.c @@ -147,8 +147,8 @@ bayes_classify_token (struct rspamd_classifier *ctx, msg_debug_bayes ( "token(meta) %uL <%*s:%*s> probabilistically skipped", tok->data, - (int) tok->t1->len, tok->t1->begin, - (int) tok->t2->len, tok->t2->begin); + (int) tok->t1->original.len, tok->t1->original.begin, + (int) tok->t2->original.len, tok->t2->original.begin); } return; @@ -199,8 +199,9 @@ bayes_classify_token (struct rspamd_classifier *ctx, msg_debug_bayes ( "token %uL <%*s:%*s> skipped, prob not in range: %f", tok->data, - (int) tok->t1->len, tok->t1->begin, - (int) tok->t2->len, tok->t2->begin, bayes_spam_prob); + (int) tok->t1->stemmed.len, tok->t1->stemmed.begin, + (int) tok->t2->stemmed.len, tok->t2->stemmed.begin, + bayes_spam_prob); return; } @@ -227,8 +228,8 @@ bayes_classify_token (struct rspamd_classifier *ctx, "current spam prob: %.3f, current ham prob: %.3f", token_type, tok->data, - (int) tok->t1->len, tok->t1->begin, - (int) tok->t2->len, tok->t2->begin, + (int) tok->t1->stemmed.len, tok->t1->stemmed.begin, + (int) tok->t2->stemmed.len, tok->t2->stemmed.begin, fw, w, total_count, spam_count, ham_count, spam_prob, ham_prob, bayes_spam_prob, bayes_ham_prob, @@ -541,8 +542,8 @@ bayes_learn_spam (struct rspamd_classifier * ctx, msg_debug_bayes ("token %uL <%*s:%*s>: window: %d, total_count: %d, " "spam_count: %d, ham_count: %d", tok->data, - (int) tok->t1->len, tok->t1->begin, - (int) tok->t2->len, tok->t2->begin, + (int) tok->t1->stemmed.len, tok->t1->stemmed.begin, + (int) tok->t2->stemmed.len, tok->t2->stemmed.begin, tok->window_idx, total_cnt, spam_cnt, ham_cnt); } else { diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index 0465f0c3c..ed3f78fde 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -41,6 +41,7 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx, lua_State *L = task->cfg->lua_state; ar = g_array_sized_new (FALSE, FALSE, sizeof (elt), 16); + memset (&elt, 0, sizeof (elt)); elt.flags = RSPAMD_STAT_TOKEN_FLAG_META; if (st_ctx->lua_stat_tokens_ref != -1) { @@ -82,8 +83,13 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx, tok.begin = lua_tolstring (L, -1, &tok.len); if (tok.begin && tok.len > 0) { - elt.begin = rspamd_mempool_ftokdup (task->task_pool, &tok); - elt.len = tok.len; + elt.original.begin = + rspamd_mempool_ftokdup (task->task_pool, &tok); + elt.original.len = tok.len; + elt.stemmed.begin = elt.original.begin; + elt.stemmed.len = elt.original.len; + elt.normalized.begin = elt.original.begin; + elt.normalized.len = elt.original.len; g_array_append_val (ar, elt); } diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c index a19217a89..0b53f8af9 100644 --- a/src/libstat/tokenizers/osb.c +++ b/src/libstat/tokenizers/osb.c @@ -304,30 +304,40 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx, for (w = 0; w < words->len; w ++) { token = &g_array_index (words, rspamd_stat_token_t, w); token_flags = token->flags; + const gchar *begin; + gsize len; - if (task->lang_det) { - if (token->flags & RSPAMD_STAT_TOKEN_FLAG_STOP_WORD) { - /* Skip stop word */ - continue; - } + if (token->flags & + (RSPAMD_STAT_TOKEN_FLAG_STOP_WORD|RSPAMD_STAT_TOKEN_FLAG_SKIPPED)) { + /* Skip stop/skipped words */ + continue; + } + + if (token->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) { + begin = token->stemmed.begin; + len = token->stemmed.len; + } + else { + begin = token->original.begin; + len = token->original.len; } if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) { rspamd_ftok_t ftok; - ftok.begin = token->begin; - ftok.len = token->len; + ftok.begin = begin; + ftok.len = len; cur = rspamd_fstrhash_lc (&ftok, is_utf); } else { /* We know that the words are normalized */ if (osb_cf->ht == RSPAMD_OSB_HASH_XXHASH) { cur = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH64, - token->begin, token->len, osb_cf->seed); + begin, len, osb_cf->seed); } else { - rspamd_cryptobox_siphash ((guchar *)&cur, token->begin, - token->len, osb_cf->sk); + rspamd_cryptobox_siphash ((guchar *)&cur, begin, + len, osb_cf->sk); if (prefix) { cur ^= seed; |