about | summary | refs | log | tree | commit | diff | stats
path: root/src/libstat/tokenizers/tokenizers.c
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru>, 2016-07-13 17:02:48 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>, 2016-07-13 17:07:23 +0100
commit: 837d577363d856e4063fd6ab26caa338635a17e5 (patch)
tree: 2491a75783afa1714f86cf712f264f6a3f2668ff /src/libstat/tokenizers/tokenizers.c
parent: 825645df65a4fcd3f7fef55f5b6ba8655dca594b (diff)
download: rspamd-837d577363d856e4063fd6ab26caa338635a17e5.tar.gz
download: rspamd-837d577363d856e4063fd6ab26caa338635a17e5.zip
[Fix] Switch hashes to mumhash
Diffstat (limited to 'src/libstat/tokenizers/tokenizers.c')
-rw-r--r-- src/libstat/tokenizers/tokenizers.c | 21
1 file changed, 12 insertions, 9 deletions
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index e2ee4665b..4e0e4b75d 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -20,7 +20,7 @@
#include "rspamd.h"
#include "tokenizers.h"
#include "stat_internal.h"
-#include "xxhash.h"
+#include "../../../contrib/mumhash/mum.h"
typedef gboolean (*token_get_function) (rspamd_ftok_t * buf, gchar const **pos,
rspamd_ftok_t * token,
@@ -292,7 +292,6 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
token_get_function func;
guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128;
guint64 hv = 0;
- XXH64_state_t *st;
gboolean decay = FALSE;
guint64 prob;
@@ -320,8 +319,6 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
}
res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_ftok_t), initial_size);
- st = XXH64_createState ();
- XXH64_reset (st, 0);
while (func (&buf, &pos, &token, &cur, is_utf, &l, FALSE)) {
if (l == 0 || (min_len > 0 && l < min_len) ||
@@ -331,7 +328,15 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
}
if (!decay) {
- XXH64_update (st, token.begin, token.len);
+ if (token.len >= sizeof (guint64)) {
+#ifdef _MUM_UNALIGNED_ACCESS
+ hv = mum_hash_step (hv, *(guint64 *)token.begin);
+#else
+ guint64 tmp;
+ memcpy (&tmp, token.begin, sizeof (tmp));
+ hv = mum_hash_step (hv, tmp);
+#endif
+ }
/* Check for decay */
if (word_decay > 0 && res->len > word_decay && pos - text < (gssize)len) {
@@ -339,7 +344,7 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
gdouble decay_prob;
decay = TRUE;
- hv = XXH64_digest (st);
+ hv = mum_hash_finish (hv);
/* We assume that word is 6 symbols length in average */
decay_prob = (gdouble)word_decay / ((len - (pos - text)) / 6.0);
@@ -368,15 +373,13 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
}
if (!decay) {
- hv = XXH64_digest (st);
+ hv = mum_hash_finish (hv);
}
if (hash) {
*hash = hv;
}
- XXH64_freeState (st);
-
return res;
}