diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-25 17:34:08 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-25 17:34:08 +0000 |
commit | e180ef2ce601b4118dab29ab074712c0a58244e4 (patch) | |
tree | b3da007cff3ce5964f86e868301fc9cb897ffb70 /src/libstat/tokenizers/osb.c | |
parent | 63ef123b048d5f1f2f6a5d172be6dc1a2629e2d7 (diff) | |
download | rspamd-e180ef2ce601b4118dab29ab074712c0a58244e4.tar.gz rspamd-e180ef2ce601b4118dab29ab074712c0a58244e4.zip |
[Project] Finish basic tasks in new unicode project
Diffstat (limited to 'src/libstat/tokenizers/osb.c')
-rw-r--r-- | src/libstat/tokenizers/osb.c | 30 |
1 files changed, 20 insertions, 10 deletions
```diff
diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c
index a19217a89..0b53f8af9 100644
--- a/src/libstat/tokenizers/osb.c
+++ b/src/libstat/tokenizers/osb.c
@@ -304,30 +304,40 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
 	for (w = 0; w < words->len; w ++) {
 		token = &g_array_index (words, rspamd_stat_token_t, w);
 		token_flags = token->flags;
+		const gchar *begin;
+		gsize len;
 
-		if (task->lang_det) {
-			if (token->flags & RSPAMD_STAT_TOKEN_FLAG_STOP_WORD) {
-				/* Skip stop word */
-				continue;
-			}
+		if (token->flags &
+				(RSPAMD_STAT_TOKEN_FLAG_STOP_WORD|RSPAMD_STAT_TOKEN_FLAG_SKIPPED)) {
+			/* Skip stop/skipped words */
+			continue;
+		}
+
+		if (token->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+			begin = token->stemmed.begin;
+			len = token->stemmed.len;
+		}
+		else {
+			begin = token->original.begin;
+			len = token->original.len;
 		}
 
 		if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
 			rspamd_ftok_t ftok;
 
-			ftok.begin = token->begin;
-			ftok.len = token->len;
+			ftok.begin = begin;
+			ftok.len = len;
 			cur = rspamd_fstrhash_lc (&ftok, is_utf);
 		}
 		else {
 			/* We know that the words are normalized */
 			if (osb_cf->ht == RSPAMD_OSB_HASH_XXHASH) {
 				cur = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH64,
-						token->begin, token->len, osb_cf->seed);
+						begin, len, osb_cf->seed);
 			}
 			else {
-				rspamd_cryptobox_siphash ((guchar *)&cur, token->begin,
-						token->len, osb_cf->sk);
+				rspamd_cryptobox_siphash ((guchar *)&cur, begin,
+						len, osb_cf->sk);
 
 				if (prefix) {
 					cur ^= seed;
```