|
|
@@ -20,7 +20,7 @@ |
|
|
|
#include "rspamd.h" |
|
|
|
#include "tokenizers.h" |
|
|
|
#include "stat_internal.h" |
|
|
|
#include "xxhash.h" |
|
|
|
#include "../../../contrib/mumhash/mum.h" |
|
|
|
|
|
|
|
typedef gboolean (*token_get_function) (rspamd_ftok_t * buf, gchar const **pos, |
|
|
|
rspamd_ftok_t * token, |
|
|
@@ -292,7 +292,6 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, |
|
|
|
token_get_function func; |
|
|
|
guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128; |
|
|
|
guint64 hv = 0; |
|
|
|
XXH64_state_t *st; |
|
|
|
gboolean decay = FALSE; |
|
|
|
guint64 prob; |
|
|
|
|
|
|
@@ -320,8 +319,6 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, |
|
|
|
} |
|
|
|
|
|
|
|
res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_ftok_t), initial_size); |
|
|
|
st = XXH64_createState (); |
|
|
|
XXH64_reset (st, 0); |
|
|
|
|
|
|
|
while (func (&buf, &pos, &token, &cur, is_utf, &l, FALSE)) { |
|
|
|
if (l == 0 || (min_len > 0 && l < min_len) || |
|
|
@@ -331,7 +328,15 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, |
|
|
|
} |
|
|
|
|
|
|
|
if (!decay) { |
|
|
|
XXH64_update (st, token.begin, token.len); |
|
|
|
if (token.len >= sizeof (guint64)) { |
|
|
|
#ifdef _MUM_UNALIGNED_ACCESS |
|
|
|
hv = mum_hash_step (hv, *(guint64 *)token.begin); |
|
|
|
#else |
|
|
|
guint64 tmp; |
|
|
|
memcpy (&tmp, token.begin, sizeof (tmp)); |
|
|
|
hv = mum_hash_step (hv, tmp); |
|
|
|
#endif |
|
|
|
} |
|
|
|
|
|
|
|
/* Check for decay */ |
|
|
|
if (word_decay > 0 && res->len > word_decay && pos - text < (gssize)len) { |
|
|
@@ -339,7 +344,7 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, |
|
|
|
gdouble decay_prob; |
|
|
|
|
|
|
|
decay = TRUE; |
|
|
|
hv = XXH64_digest (st); |
|
|
|
hv = mum_hash_finish (hv); |
|
|
|
|
|
|
|
/* We assume that word is 6 symbols length in average */ |
|
|
|
decay_prob = (gdouble)word_decay / ((len - (pos - text)) / 6.0); |
|
|
@@ -368,15 +373,13 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, |
|
|
|
} |
|
|
|
|
|
|
|
if (!decay) { |
|
|
|
hv = XXH64_digest (st); |
|
|
|
hv = mum_hash_finish (hv); |
|
|
|
} |
|
|
|
|
|
|
|
if (hash) { |
|
|
|
*hash = hv; |
|
|
|
} |
|
|
|
|
|
|
|
XXH64_freeState (st); |
|
|
|
|
|
|
|
return res; |
|
|
|
} |
|
|
|
|