aboutsummaryrefslogtreecommitdiffstats
path: root/src/libstat/stat_process.c
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-02-23 14:29:31 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-02-23 14:29:31 +0000
commit: 21a12878cc50c97444c41886b23e418087922783 (patch)
tree: c1f74997ac28d4355ebf2eb0997b0e6e2f22770c /src/libstat/stat_process.c
parent: fec137a7cccd626ce248f619011b2570f75438f8 (diff)
download: rspamd-21a12878cc50c97444c41886b23e418087922783.tar.gz
          rspamd-21a12878cc50c97444c41886b23e418087922783.zip
Rework tokenization:
- Use normalized words if needed
- Allow using seeded XXHash instead of the hand-made legacy hash
- Allow secure hashing using siphash
Diffstat (limited to 'src/libstat/stat_process.c')
-rw-r--r-- src/libstat/stat_process.c 23
1 files changed, 22 insertions, 1 deletions
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index eafbe2092..f5a4b9398 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -287,6 +287,20 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf,
GArray *words;
gchar *sub;
GList *cur;
+ const ucl_object_t *elt;
+ gboolean compat = TRUE;
+
+ /*
+ * XXX: Ugly repetition to be backward compatible
+ */
+ if (cf != NULL && cf->opts != NULL) {
+ elt = ucl_object_find_key (cf->opts, "hash");
+ if (elt != NULL && ucl_object_type (elt) == UCL_STRING) {
+ if (g_ascii_strcasecmp (ucl_object_tostring (elt), "xxh") == 0) {
+ compat = FALSE;
+ }
+ }
+ }
cur = task->text_parts;
@@ -297,8 +311,15 @@ rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf,
/*
* XXX: Use normalized words if needed here
*/
- tok->tokenizer->tokenize_func (cf, task->task_pool,
+
+ if (compat) {
+ tok->tokenizer->tokenize_func (cf, task->task_pool,
part->words, tok->tokens, part->is_utf);
+ }
+ else {
+ tok->tokenizer->tokenize_func (cf, task->task_pool,
+ part->normalized_words, tok->tokens, part->is_utf);
+ }
}
cur = g_list_next (cur);