diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-02-23 14:29:31 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-02-23 14:29:31 +0000 |
commit | 21a12878cc50c97444c41886b23e418087922783 (patch) | |
tree | c1f74997ac28d4355ebf2eb0997b0e6e2f22770c /src/libstat/stat_process.c | |
parent | fec137a7cccd626ce248f619011b2570f75438f8 (diff) | |
download | rspamd-21a12878cc50c97444c41886b23e418087922783.tar.gz rspamd-21a12878cc50c97444c41886b23e418087922783.zip |
Rework tokenization:
- Use normalized words if needed
- Allow using seeded XXHash instead of the hand-made legacy hash
- Allow secure hashing using siphash
Diffstat (limited to 'src/libstat/stat_process.c')
-rw-r--r-- | src/libstat/stat_process.c | 23 |
1 files changed, 22 insertions, 1 deletions
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index eafbe2092..f5a4b9398 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -287,6 +287,20 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf, GArray *words; gchar *sub; GList *cur; + const ucl_object_t *elt; + gboolean compat = TRUE; + + /* + * XXX: Ugly repetition to be backward compatible + */ + if (cf != NULL && cf->opts != NULL) { + elt = ucl_object_find_key (cf->opts, "hash"); + if (elt != NULL && ucl_object_type (elt) == UCL_STRING) { + if (g_ascii_strcasecmp (ucl_object_tostring (elt), "xxh") == 0) { + compat = FALSE; + } + } + } cur = task->text_parts; @@ -297,8 +311,15 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf, /* * XXX: Use normalized words if needed here */ - tok->tokenizer->tokenize_func (cf, task->task_pool, + + if (compat) { + tok->tokenizer->tokenize_func (cf, task->task_pool, part->words, tok->tokens, part->is_utf); + } + else { + tok->tokenizer->tokenize_func (cf, task->task_pool, + part->normalized_words, tok->tokens, part->is_utf); + } } cur = g_list_next (cur); |