diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2016-05-11 15:54:47 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2016-05-11 15:54:47 +0100 |
commit | 4932b636ef23b7b0c9c356c9144457039253945e (patch) | |
tree | 0f086aac6c37acc408a85fcfd45800144b63574c /src | |
parent | bcb5eaadd0b810b399fbdb133253bbc9ec7f060e (diff) | |
download | rspamd-4932b636ef23b7b0c9c356c9144457039253945e.tar.gz rspamd-4932b636ef23b7b0c9c356c9144457039253945e.zip |
[Feature] Add more algorithms for shingles generation
Diffstat (limited to 'src')
-rw-r--r-- | src/libutil/shingles.c | 74 | ||||
-rw-r--r-- | src/libutil/shingles.h | 9 | ||||
-rw-r--r-- | src/plugins/fuzzy_check.c | 3 |
3 files changed, 69 insertions, 17 deletions
diff --git a/src/libutil/shingles.c b/src/libutil/shingles.c index c2acae6d3..04a9975bc 100644 --- a/src/libutil/shingles.c +++ b/src/libutil/shingles.c @@ -24,7 +24,8 @@ rspamd_shingles_generate (GArray *input, const guchar key[16], rspamd_mempool_t *pool, rspamd_shingles_filter filter, - gpointer filterd) + gpointer filterd, + enum rspamd_shingle_alg alg) { struct rspamd_shingle *res; GArray *hashes[RSPAMD_SHINGLE_SIZE]; @@ -35,7 +36,8 @@ rspamd_shingles_generate (GArray *input, rspamd_ftok_t *word; rspamd_cryptobox_hash_state_t bs; guint64 val; - gint i, j, beg = 0; + gint i, j, k, beg = 0; + enum rspamd_cryptobox_fast_hash_type ht; if (pool != NULL) { res = rspamd_mempool_alloc (pool, sizeof (*res)); @@ -71,22 +73,64 @@ rspamd_shingles_generate (GArray *input, } /* Now parse input words into a vector of hashes using rolling window */ - for (i = 0; i <= (gint)input->len; i ++) { - if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) { - for (j = beg; j < i; j ++) { - word = &g_array_index (input, rspamd_ftok_t, j); - row = rspamd_fstring_append (row, word->begin, word->len); + if (alg == RSPAMD_SHINGLES_OLD) { + for (i = 0; i <= (gint)input->len; i ++) { + if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) { + for (j = beg; j < i; j ++) { + word = &g_array_index (input, rspamd_ftok_t, j); + row = rspamd_fstring_append (row, word->begin, word->len); + } + beg++; + + /* Now we need to create a new row here */ + for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) { + rspamd_cryptobox_siphash ((guchar *)&val, row->str, row->len, + keys[j]); + g_array_append_val (hashes[j], val); + } + + row = rspamd_fstring_assign (row, "", 0); } - beg++; + } + } + else { + guint64 res[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE]; - /* Now we need to create a new row here */ - for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) { - rspamd_cryptobox_siphash ((guchar *)&val, row->str, row->len, - keys[j]); - g_array_append_val (hashes[j], val); - } + if (alg == RSPAMD_SHINGLES_XXHASH) { + ht = RSPAMD_CRYPTOBOX_XXHASH64; + } + else { + ht = RSPAMD_CRYPTOBOX_MUMHASH; + } - row = rspamd_fstring_assign (row, "", 0); + memset (res, 0, sizeof (res)); + + for (i = 0; i <= (gint)input->len; i ++) { + if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) { + + for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) { + /* Shift hashes window to right */ + for (k = 0; k < SHINGLES_WINDOW - 1; k ++) { + res[j * SHINGLES_WINDOW + k] = + res[j * SHINGLES_WINDOW + k + 1]; + } + + word = &g_array_index (input, rspamd_ftok_t, beg); + /* Insert the last element to the pipe */ + res[j * SHINGLES_WINDOW + SHINGLES_WINDOW - 1] = + rspamd_cryptobox_fast_hash_specific (ht, + word->begin, word->len, + *(guint64 *)keys[j]); + + val = 0; + for (k = 0; k < SHINGLES_WINDOW; k ++) { + val ^= res[j * SHINGLES_WINDOW + k]; + } + + g_array_append_val (hashes[j], val); + } + beg++; + } } } diff --git a/src/libutil/shingles.h b/src/libutil/shingles.h index d252a78f6..a46a2fc0f 100644 --- a/src/libutil/shingles.h +++ b/src/libutil/shingles.h @@ -25,6 +25,12 @@ struct rspamd_shingle { guint64 hashes[RSPAMD_SHINGLE_SIZE]; }; +enum rspamd_shingle_alg { + RSPAMD_SHINGLES_OLD = 0, + RSPAMD_SHINGLES_XXHASH, + RSPAMD_SHINGLES_MUMHASH, +}; + /** * Shingles filtering function * @param input input array of hashes @@ -48,7 +54,8 @@ struct rspamd_shingle* rspamd_shingles_generate (GArray *input, const guchar key[16], rspamd_mempool_t *pool, rspamd_shingles_filter filter, - gpointer filterd); + gpointer filterd, + enum rspamd_shingle_alg alg); /** * Compares two shingles and return result as a floating point value - 1.0 diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c index a5b62875e..304ce3704 100644 --- a/src/plugins/fuzzy_check.c +++ b/src/plugins/fuzzy_check.c @@ -1068,7 +1068,8 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule, rule->shingles_key->str); sh = rspamd_shingles_generate (words, rule->shingles_key->str, pool, - rspamd_shingles_default_filter, NULL); + rspamd_shingles_default_filter, NULL, + RSPAMD_SHINGLES_OLD); if (sh != NULL) { memcpy (&shcmd->sgl, sh, sizeof (shcmd->sgl)); shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE; |