From: Vsevolod Stakhov Date: Sun, 26 Jul 2015 20:43:52 +0000 (+0100) Subject: Allow adding of prefix for tokenizers. X-Git-Tag: 1.0.0~285 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=277ffc972d161a0928bc07c2e5d110b00115744c;p=rspamd.git Allow adding of prefix for tokenizers. --- diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index 536db9b16..458c42cad 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -318,11 +318,11 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf, if (!IS_PART_EMPTY (part) && part->words != NULL) { if (compat) { tok->tokenizer->tokenize_func (cf, task->task_pool, - part->words, tok->tokens, IS_PART_UTF (part)); + part->words, tok->tokens, IS_PART_UTF (part), NULL); } else { tok->tokenizer->tokenize_func (cf, task->task_pool, - part->normalized_words, tok->tokens, IS_PART_UTF (part)); + part->normalized_words, tok->tokens, IS_PART_UTF (part), NULL); } } } @@ -342,7 +342,8 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf, task->task_pool, words, tok->tokens, - TRUE); + TRUE, + "SUBJECT"); g_array_free (words, TRUE); } } diff --git a/src/libstat/tokenizers/osb.c b/src/libstat/tokenizers/osb.c index dd413a9b4..ae6eabb27 100644 --- a/src/libstat/tokenizers/osb.c +++ b/src/libstat/tokenizers/osb.c @@ -212,12 +212,13 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf, rspamd_mempool_t * pool, GArray * input, GTree * tree, - gboolean is_utf) + gboolean is_utf, + const gchar *prefix) { rspamd_token_t *new = NULL; rspamd_fstring_t *token; struct rspamd_osb_tokenizer_config *osb_cf; - guint64 *hashpipe, cur; + guint64 *hashpipe, cur, seed; guint32 h1, h2; guint processed = 0, i, w, window_size; @@ -236,6 +237,13 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf, window_size = osb_cf->window_size; + if (prefix) { + seed = XXH64 (prefix, strlen (prefix), osb_cf->seed); + } + else { + seed = osb_cf->seed; + } + hashpipe = g_alloca (window_size * sizeof (hashpipe[0])); memset (hashpipe, 0xfe, window_size * sizeof (hashpipe[0])); @@ -253,6 +261,10 @@ rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf, else { rspamd_cryptobox_siphash ((guchar *)&cur, token->begin, token->len, osb_cf->sk); + + if (prefix) { + cur ^= seed; + } } } diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index 2c96b7cff..ef3fc8af0 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -19,7 +19,8 @@ struct rspamd_stat_tokenizer { rspamd_mempool_t *pool, GArray *words, GTree *result, - gboolean is_utf); + gboolean is_utf, + const gchar *prefix); }; /* Compare two token nodes */ @@ -36,7 +37,8 @@ gint rspamd_tokenizer_osb (struct rspamd_tokenizer_config *cf, rspamd_mempool_t *pool, GArray *input, GTree *tokens, - gboolean is_utf); + gboolean is_utf, + const gchar *prefix); gpointer rspamd_tokenizer_osb_get_config (struct rspamd_tokenizer_config *cf, gsize *len);