From 837d577363d856e4063fd6ab26caa338635a17e5 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Wed, 13 Jul 2016 17:02:48 +0100 Subject: [PATCH] [Fix] Switch hashes to mumhash --- src/libstat/tokenizers/tokenizers.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index e2ee4665b..4e0e4b75d 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -20,7 +20,7 @@ #include "rspamd.h" #include "tokenizers.h" #include "stat_internal.h" -#include "xxhash.h" +#include "../../../contrib/mumhash/mum.h" typedef gboolean (*token_get_function) (rspamd_ftok_t * buf, gchar const **pos, rspamd_ftok_t * token, @@ -292,7 +292,6 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, token_get_function func; guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128; guint64 hv = 0; - XXH64_state_t *st; gboolean decay = FALSE; guint64 prob; @@ -320,8 +319,6 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, } res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_ftok_t), initial_size); - st = XXH64_createState (); - XXH64_reset (st, 0); while (func (&buf, &pos, &token, &cur, is_utf, &l, FALSE)) { if (l == 0 || (min_len > 0 && l < min_len) || @@ -331,7 +328,15 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, } if (!decay) { - XXH64_update (st, token.begin, token.len); + if (token.len >= sizeof (guint64)) { +#ifdef _MUM_UNALIGNED_ACCESS + hv = mum_hash_step (hv, *(guint64 *)token.begin); +#else + guint64 tmp; + memcpy (&tmp, token.begin, sizeof (tmp)); + hv = mum_hash_step (hv, tmp); +#endif + } /* Check for decay */ if (word_decay > 0 && res->len > word_decay && pos - text < (gssize)len) { @@ -339,7 +344,7 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, gdouble decay_prob; decay = TRUE; - hv = XXH64_digest (st); + hv = mum_hash_finish (hv); /* We assume that word is 6 symbols length in average */ decay_prob = (gdouble)word_decay / ((len - (pos - text)) / 6.0); @@ -368,15 +373,13 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, } if (!decay) { - hv = XXH64_digest (st); + hv = mum_hash_finish (hv); } if (hash) { *hash = hv; } - XXH64_freeState (st); - return res; } -- 2.39.5