diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-07-26 21:43:52 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-07-26 21:43:52 +0100 |
commit | 277ffc972d161a0928bc07c2e5d110b00115744c (patch) | |
tree | 246b76ade650c124e987f9ff4a662ded8b23d07e /src/libstat | |
parent | e38b84cd48f8c465b94395a5641f85209ab22b68 (diff) | |
download | rspamd-277ffc972d161a0928bc07c2e5d110b00115744c.tar.gz rspamd-277ffc972d161a0928bc07c2e5d110b00115744c.zip |
Allow adding a prefix for tokenizers.
Diffstat (limited to 'src/libstat')
-rw-r--r-- | src/libstat/stat_process.c | 7 | ||||
-rw-r--r-- | src/libstat/tokenizers/osb.c | 16 | ||||
-rw-r--r-- | src/libstat/tokenizers/tokenizers.h | 6 |
3 files changed, 22 insertions, 7 deletions
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index 536db9b16..458c42cad 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -318,11 +318,11 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf, if (!IS_PART_EMPTY (part) && part->words != NULL) { if (compat) { tok->tokenizer->tokenize_func (cf, task->task_pool, - part->words, tok->tokens, IS_PART_UTF (part)); + part->words, tok->tokens, IS_PART_UTF (part), NULL); } else { tok->tokenizer->tokenize_func (cf, task->task_pool, - part->normalized_words, tok->tokens, IS_PART_UTF (part)); + part->normalized_words, tok->tokens, IS_PART_UTF (part), NULL); } } } @@ -342,7 +342,8 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf, task->task_pool, words, tok->tokens, - TRUE); + TRUE, + "SUBJECT"); g_array_free (words, TRUE); } } diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c index dd413a9b4..ae6eabb27 100644 --- a/src/libstat/tokenizers/osb.c +++ b/src/libstat/tokenizers/osb.c @@ -212,12 +212,13 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf, rspamd_mempool_t * pool, GArray * input, GTree * tree, - gboolean is_utf) + gboolean is_utf, + const gchar *prefix) { rspamd_token_t *new = NULL; rspamd_fstring_t *token; struct rspamd_osb_tokenizer_config *osb_cf; - guint64 *hashpipe, cur; + guint64 *hashpipe, cur, seed; guint32 h1, h2; guint processed = 0, i, w, window_size; @@ -236,6 +237,13 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf, window_size = osb_cf->window_size; + if (prefix) { + seed = XXH64 (prefix, strlen (prefix), osb_cf->seed); + } + else { + seed = osb_cf->seed; + } + hashpipe = g_alloca (window_size * sizeof (hashpipe[0])); memset (hashpipe, 0xfe, window_size * sizeof (hashpipe[0])); @@ -253,6 +261,10 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf, else { rspamd_cryptobox_siphash ((guchar *)&cur, token->begin, token->len, osb_cf->sk); + + if (prefix) { + cur ^= seed; + } } } 
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index 2c96b7cff..ef3fc8af0 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -19,7 +19,8 @@ struct rspamd_stat_tokenizer { rspamd_mempool_t *pool, GArray *words, GTree *result, - gboolean is_utf); + gboolean is_utf, + const gchar *prefix); }; /* Compare two token nodes */ @@ -36,7 +37,8 @@ gint rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf, rspamd_mempool_t *pool, GArray *input, GTree *tokens, - gboolean is_utf); + gboolean is_utf, + const gchar *prefix); gpointer rspamd_tokenizer_osb_get_config (struct rspamd_tokenizer_config *cf, gsize *len); |