diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2016-07-13 17:02:48 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2016-07-13 17:07:23 +0100 |
commit | 837d577363d856e4063fd6ab26caa338635a17e5 (patch) | |
tree | 2491a75783afa1714f86cf712f264f6a3f2668ff /src/libstat/tokenizers | |
parent | 825645df65a4fcd3f7fef55f5b6ba8655dca594b (diff) | |
download | rspamd-837d577363d856e4063fd6ab26caa338635a17e5.tar.gz rspamd-837d577363d856e4063fd6ab26caa338635a17e5.zip |
[Fix] Switch hashes to mumhash
Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 21 |
1 files changed, 12 insertions, 9 deletions
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index e2ee4665b..4e0e4b75d 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -20,7 +20,7 @@ #include "rspamd.h" #include "tokenizers.h" #include "stat_internal.h" -#include "xxhash.h" +#include "../../../contrib/mumhash/mum.h" typedef gboolean (*token_get_function) (rspamd_ftok_t * buf, gchar const **pos, rspamd_ftok_t * token, @@ -292,7 +292,6 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, token_get_function func; guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128; guint64 hv = 0; - XXH64_state_t *st; gboolean decay = FALSE; guint64 prob; @@ -320,8 +319,6 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, } res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_ftok_t), initial_size); - st = XXH64_createState (); - XXH64_reset (st, 0); while (func (&buf, &pos, &token, &cur, is_utf, &l, FALSE)) { if (l == 0 || (min_len > 0 && l < min_len) || @@ -331,7 +328,15 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, } if (!decay) { - XXH64_update (st, token.begin, token.len); + if (token.len >= sizeof (guint64)) { +#ifdef _MUM_UNALIGNED_ACCESS + hv = mum_hash_step (hv, *(guint64 *)token.begin); +#else + guint64 tmp; + memcpy (&tmp, token.begin, sizeof (tmp)); + hv = mum_hash_step (hv, tmp); +#endif + } /* Check for decay */ if (word_decay > 0 && res->len > word_decay && pos - text < (gssize)len) { @@ -339,7 +344,7 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, gdouble decay_prob; decay = TRUE; - hv = XXH64_digest (st); + hv = mum_hash_finish (hv); /* We assume that word is 6 symbols length in average */ decay_prob = (gdouble)word_decay / ((len - (pos - text)) / 6.0); @@ -368,15 +373,13 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, } if (!decay) { - hv = XXH64_digest (st); + hv = mum_hash_finish (hv); } if (hash) { *hash = hv; } - XXH64_freeState (st); - return res; } |