about | summary | refs | log | tree | commit | diff | stats
path: root/src/libstat
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-25 17:34:08 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-25 17:34:08 +0000
commit e180ef2ce601b4118dab29ab074712c0a58244e4 (patch)
tree b3da007cff3ce5964f86e868301fc9cb897ffb70 /src/libstat
parent 63ef123b048d5f1f2f6a5d172be6dc1a2629e2d7 (diff)
download rspamd-e180ef2ce601b4118dab29ab074712c0a58244e4.tar.gz
rspamd-e180ef2ce601b4118dab29ab074712c0a58244e4.zip
[Project] Finish basic tasks in new unicode project
Diffstat (limited to 'src/libstat')
-rw-r--r-- src/libstat/backends/redis_backend.c 12
-rw-r--r-- src/libstat/classifiers/bayes.c 17
-rw-r--r-- src/libstat/stat_process.c 10
-rw-r--r-- src/libstat/tokenizers/osb.c 30
4 files changed, 43 insertions, 26 deletions
diff --git a/src/libstat/backends/redis_backend.c b/src/libstat/backends/redis_backend.c
index 00441a7a6..b003d5a27 100644
--- a/src/libstat/backends/redis_backend.c
+++ b/src/libstat/backends/redis_backend.c
@@ -527,14 +527,14 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task,
"HSET %b_tokens %b %b:%b",
prefix, (size_t) prefix_len,
n0, (size_t) l0,
- tok->t1->begin, tok->t1->len,
- tok->t2->begin, tok->t2->len);
+ tok->t1->stemmed.begin, tok->t1->stemmed.len,
+ tok->t2->stemmed.begin, tok->t2->stemmed.len);
} else if (tok->t1) {
redisAsyncCommand (rt->redis, NULL, NULL,
"HSET %b_tokens %b %b",
prefix, (size_t) prefix_len,
n0, (size_t) l0,
- tok->t1->begin, tok->t1->len);
+ tok->t1->stemmed.begin, tok->t1->stemmed.len);
}
}
else {
@@ -548,14 +548,14 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task,
"HSET %b %s %b:%b",
n0, (size_t) l0,
"tokens",
- tok->t1->begin, tok->t1->len,
- tok->t2->begin, tok->t2->len);
+ tok->t1->stemmed.begin, tok->t1->stemmed.len,
+ tok->t2->stemmed.begin, tok->t2->stemmed.len);
} else if (tok->t1) {
redisAsyncCommand (rt->redis, NULL, NULL,
"HSET %b %s %b",
n0, (size_t) l0,
"tokens",
- tok->t1->begin, tok->t1->len);
+ tok->t1->stemmed.begin, tok->t1->stemmed.len);
}
}
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c
index 934c8d941..2b0cf21e8 100644
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -147,8 +147,8 @@ bayes_classify_token (struct rspamd_classifier *ctx,
msg_debug_bayes (
"token(meta) %uL <%*s:%*s> probabilistically skipped",
tok->data,
- (int) tok->t1->len, tok->t1->begin,
- (int) tok->t2->len, tok->t2->begin);
+ (int) tok->t1->original.len, tok->t1->original.begin,
+ (int) tok->t2->original.len, tok->t2->original.begin);
}
return;
@@ -199,8 +199,9 @@ bayes_classify_token (struct rspamd_classifier *ctx,
msg_debug_bayes (
"token %uL <%*s:%*s> skipped, prob not in range: %f",
tok->data,
- (int) tok->t1->len, tok->t1->begin,
- (int) tok->t2->len, tok->t2->begin, bayes_spam_prob);
+ (int) tok->t1->stemmed.len, tok->t1->stemmed.begin,
+ (int) tok->t2->stemmed.len, tok->t2->stemmed.begin,
+ bayes_spam_prob);
return;
}
@@ -227,8 +228,8 @@ bayes_classify_token (struct rspamd_classifier *ctx,
"current spam prob: %.3f, current ham prob: %.3f",
token_type,
tok->data,
- (int) tok->t1->len, tok->t1->begin,
- (int) tok->t2->len, tok->t2->begin,
+ (int) tok->t1->stemmed.len, tok->t1->stemmed.begin,
+ (int) tok->t2->stemmed.len, tok->t2->stemmed.begin,
fw, w, total_count, spam_count, ham_count,
spam_prob, ham_prob,
bayes_spam_prob, bayes_ham_prob,
@@ -541,8 +542,8 @@ bayes_learn_spam (struct rspamd_classifier * ctx,
msg_debug_bayes ("token %uL <%*s:%*s>: window: %d, total_count: %d, "
"spam_count: %d, ham_count: %d",
tok->data,
- (int) tok->t1->len, tok->t1->begin,
- (int) tok->t2->len, tok->t2->begin,
+ (int) tok->t1->stemmed.len, tok->t1->stemmed.begin,
+ (int) tok->t2->stemmed.len, tok->t2->stemmed.begin,
tok->window_idx, total_cnt, spam_cnt, ham_cnt);
}
else {
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index 0465f0c3c..ed3f78fde 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -41,6 +41,7 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx,
lua_State *L = task->cfg->lua_state;
ar = g_array_sized_new (FALSE, FALSE, sizeof (elt), 16);
+ memset (&elt, 0, sizeof (elt));
elt.flags = RSPAMD_STAT_TOKEN_FLAG_META;
if (st_ctx->lua_stat_tokens_ref != -1) {
@@ -82,8 +83,13 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx,
tok.begin = lua_tolstring (L, -1, &tok.len);
if (tok.begin && tok.len > 0) {
- elt.begin = rspamd_mempool_ftokdup (task->task_pool, &tok);
- elt.len = tok.len;
+ elt.original.begin =
+ rspamd_mempool_ftokdup (task->task_pool, &tok);
+ elt.original.len = tok.len;
+ elt.stemmed.begin = elt.original.begin;
+ elt.stemmed.len = elt.original.len;
+ elt.normalized.begin = elt.original.begin;
+ elt.normalized.len = elt.original.len;
g_array_append_val (ar, elt);
}
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
index a19217a89..0b53f8af9 100644
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -304,30 +304,40 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
for (w = 0; w < words->len; w ++) {
token = &g_array_index (words, rspamd_stat_token_t, w);
token_flags = token->flags;
+ const gchar *begin;
+ gsize len;
- if (task->lang_det) {
- if (token->flags & RSPAMD_STAT_TOKEN_FLAG_STOP_WORD) {
- /* Skip stop word */
- continue;
- }
+ if (token->flags &
+ (RSPAMD_STAT_TOKEN_FLAG_STOP_WORD|RSPAMD_STAT_TOKEN_FLAG_SKIPPED)) {
+ /* Skip stop/skipped words */
+ continue;
+ }
+
+ if (token->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+ begin = token->stemmed.begin;
+ len = token->stemmed.len;
+ }
+ else {
+ begin = token->original.begin;
+ len = token->original.len;
}
if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
rspamd_ftok_t ftok;
- ftok.begin = token->begin;
- ftok.len = token->len;
+ ftok.begin = begin;
+ ftok.len = len;
cur = rspamd_fstrhash_lc (&ftok, is_utf);
}
else {
/* We know that the words are normalized */
if (osb_cf->ht == RSPAMD_OSB_HASH_XXHASH) {
cur = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH64,
- token->begin, token->len, osb_cf->seed);
+ begin, len, osb_cf->seed);
}
else {
- rspamd_cryptobox_siphash ((guchar *)&cur, token->begin,
- token->len, osb_cf->sk);
+ rspamd_cryptobox_siphash ((guchar *)&cur, begin,
+ len, osb_cf->sk);
if (prefix) {
cur ^= seed;