diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2016-05-11 15:54:47 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2016-05-11 15:54:47 +0100 |
commit | 4932b636ef23b7b0c9c356c9144457039253945e (patch) | |
tree | 0f086aac6c37acc408a85fcfd45800144b63574c /src/libutil/shingles.c | |
parent | bcb5eaadd0b810b399fbdb133253bbc9ec7f060e (diff) | |
download | rspamd-4932b636ef23b7b0c9c356c9144457039253945e.tar.gz rspamd-4932b636ef23b7b0c9c356c9144457039253945e.zip |
[Feature] Add more algorithms for shingles generation
Diffstat (limited to 'src/libutil/shingles.c')
-rw-r--r-- | src/libutil/shingles.c | 74 |
1 file changed, 59 insertions, 15 deletions
diff --git a/src/libutil/shingles.c b/src/libutil/shingles.c index c2acae6d3..04a9975bc 100644 --- a/src/libutil/shingles.c +++ b/src/libutil/shingles.c @@ -24,7 +24,8 @@ rspamd_shingles_generate (GArray *input, const guchar key[16], rspamd_mempool_t *pool, rspamd_shingles_filter filter, - gpointer filterd) + gpointer filterd, + enum rspamd_shingle_alg alg) { struct rspamd_shingle *res; GArray *hashes[RSPAMD_SHINGLE_SIZE]; @@ -35,7 +36,8 @@ rspamd_shingles_generate (GArray *input, rspamd_ftok_t *word; rspamd_cryptobox_hash_state_t bs; guint64 val; - gint i, j, beg = 0; + gint i, j, k, beg = 0; + enum rspamd_cryptobox_fast_hash_type ht; if (pool != NULL) { res = rspamd_mempool_alloc (pool, sizeof (*res)); @@ -71,22 +73,64 @@ rspamd_shingles_generate (GArray *input, } /* Now parse input words into a vector of hashes using rolling window */ - for (i = 0; i <= (gint)input->len; i ++) { - if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) { - for (j = beg; j < i; j ++) { - word = &g_array_index (input, rspamd_ftok_t, j); - row = rspamd_fstring_append (row, word->begin, word->len); + if (alg == RSPAMD_SHINGLES_OLD) { + for (i = 0; i <= (gint)input->len; i ++) { + if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) { + for (j = beg; j < i; j ++) { + word = &g_array_index (input, rspamd_ftok_t, j); + row = rspamd_fstring_append (row, word->begin, word->len); + } + beg++; + + /* Now we need to create a new row here */ + for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) { + rspamd_cryptobox_siphash ((guchar *)&val, row->str, row->len, + keys[j]); + g_array_append_val (hashes[j], val); + } + + row = rspamd_fstring_assign (row, "", 0); } - beg++; + } + } + else { + guint64 res[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE]; - /* Now we need to create a new row here */ - for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) { - rspamd_cryptobox_siphash ((guchar *)&val, row->str, row->len, - keys[j]); - g_array_append_val (hashes[j], val); - } + if (alg == RSPAMD_SHINGLES_XXHASH) { + ht = 
RSPAMD_CRYPTOBOX_XXHASH64; + } + else { + ht = RSPAMD_CRYPTOBOX_MUMHASH; + } - row = rspamd_fstring_assign (row, "", 0); + memset (res, 0, sizeof (res)); + + for (i = 0; i <= (gint)input->len; i ++) { + if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) { + + for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) { + /* Shift hashes window to right */ + for (k = 0; k < SHINGLES_WINDOW - 1; k ++) { + res[j * SHINGLES_WINDOW + k] = + res[j * SHINGLES_WINDOW + k + 1]; + } + + word = &g_array_index (input, rspamd_ftok_t, beg); + /* Insert the last element to the pipe */ + res[j * SHINGLES_WINDOW + SHINGLES_WINDOW - 1] = + rspamd_cryptobox_fast_hash_specific (ht, + word->begin, word->len, + *(guint64 *)keys[j]); + + val = 0; + for (k = 0; k < SHINGLES_WINDOW; k ++) { + val ^= res[j * SHINGLES_WINDOW + k]; + } + + g_array_append_val (hashes[j], val); + } + beg++; + } } } |