From 4932b636ef23b7b0c9c356c9144457039253945e Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Wed, 11 May 2016 15:54:47 +0100 Subject: [PATCH] [Feature] Add more algorithms for shingles generation --- src/libutil/shingles.c | 74 +++++++++++++++++++++++++++++-------- src/libutil/shingles.h | 9 ++++- src/plugins/fuzzy_check.c | 3 +- test/rspamd_shingles_test.c | 31 ++++++++++------ 4 files changed, 88 insertions(+), 29 deletions(-) diff --git a/src/libutil/shingles.c b/src/libutil/shingles.c index c2acae6d3..04a9975bc 100644 --- a/src/libutil/shingles.c +++ b/src/libutil/shingles.c @@ -24,7 +24,8 @@ rspamd_shingles_generate (GArray *input, const guchar key[16], rspamd_mempool_t *pool, rspamd_shingles_filter filter, - gpointer filterd) + gpointer filterd, + enum rspamd_shingle_alg alg) { struct rspamd_shingle *res; GArray *hashes[RSPAMD_SHINGLE_SIZE]; @@ -35,7 +36,8 @@ rspamd_shingles_generate (GArray *input, rspamd_ftok_t *word; rspamd_cryptobox_hash_state_t bs; guint64 val; - gint i, j, beg = 0; + gint i, j, k, beg = 0; + enum rspamd_cryptobox_fast_hash_type ht; if (pool != NULL) { res = rspamd_mempool_alloc (pool, sizeof (*res)); @@ -71,22 +73,64 @@ rspamd_shingles_generate (GArray *input, } /* Now parse input words into a vector of hashes using rolling window */ - for (i = 0; i <= (gint)input->len; i ++) { - if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) { - for (j = beg; j < i; j ++) { - word = &g_array_index (input, rspamd_ftok_t, j); - row = rspamd_fstring_append (row, word->begin, word->len); + if (alg == RSPAMD_SHINGLES_OLD) { + for (i = 0; i <= (gint)input->len; i ++) { + if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) { + for (j = beg; j < i; j ++) { + word = &g_array_index (input, rspamd_ftok_t, j); + row = rspamd_fstring_append (row, word->begin, word->len); + } + beg++; + + /* Now we need to create a new row here */ + for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) { + rspamd_cryptobox_siphash ((guchar *)&val, row->str, row->len, + keys[j]); + g_array_append_val (hashes[j], val); + } + + row = rspamd_fstring_assign (row, "", 0); } - beg++; + } + } + else { + guint64 res[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE]; - /* Now we need to create a new row here */ - for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) { - rspamd_cryptobox_siphash ((guchar *)&val, row->str, row->len, - keys[j]); - g_array_append_val (hashes[j], val); - } + if (alg == RSPAMD_SHINGLES_XXHASH) { + ht = RSPAMD_CRYPTOBOX_XXHASH64; + } + else { + ht = RSPAMD_CRYPTOBOX_MUMHASH; + } - row = rspamd_fstring_assign (row, "", 0); + memset (res, 0, sizeof (res)); + + for (i = 0; i <= (gint)input->len; i ++) { + if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) { + + for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) { + /* Shift hashes window to right */ + for (k = 0; k < SHINGLES_WINDOW - 1; k ++) { + res[j * SHINGLES_WINDOW + k] = + res[j * SHINGLES_WINDOW + k + 1]; + } + + word = &g_array_index (input, rspamd_ftok_t, beg); + /* Insert the last element to the pipe */ + res[j * SHINGLES_WINDOW + SHINGLES_WINDOW - 1] = + rspamd_cryptobox_fast_hash_specific (ht, + word->begin, word->len, + *(guint64 *)keys[j]); + + val = 0; + for (k = 0; k < SHINGLES_WINDOW; k ++) { + val ^= res[j * SHINGLES_WINDOW + k]; + } + + g_array_append_val (hashes[j], val); + } + beg++; + } } } diff --git a/src/libutil/shingles.h b/src/libutil/shingles.h index d252a78f6..a46a2fc0f 100644 --- a/src/libutil/shingles.h +++ b/src/libutil/shingles.h @@ -25,6 +25,12 @@ struct rspamd_shingle { guint64 hashes[RSPAMD_SHINGLE_SIZE]; }; +enum rspamd_shingle_alg { + RSPAMD_SHINGLES_OLD = 0, + RSPAMD_SHINGLES_XXHASH, + RSPAMD_SHINGLES_MUMHASH, +}; + /** * Shingles filtering function * @param input input array of hashes @@ -48,7 +54,8 @@ struct rspamd_shingle* rspamd_shingles_generate (GArray *input, const guchar key[16], rspamd_mempool_t *pool, rspamd_shingles_filter filter, - gpointer filterd); + gpointer filterd, + enum rspamd_shingle_alg alg); /** * Compares two shingles and return result as a floating point value - 1.0 diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c index a5b62875e..304ce3704 100644 --- a/src/plugins/fuzzy_check.c +++ b/src/plugins/fuzzy_check.c @@ -1068,7 +1068,8 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule, rule->shingles_key->str); sh = rspamd_shingles_generate (words, rule->shingles_key->str, pool, - rspamd_shingles_default_filter, NULL); + rspamd_shingles_default_filter, NULL, + RSPAMD_SHINGLES_OLD); if (sh != NULL) { memcpy (&shcmd->sgl, sh, sizeof (shcmd->sgl)); shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE; diff --git a/test/rspamd_shingles_test.c b/test/rspamd_shingles_test.c index 3eed5eee5..cc3b34da7 100644 --- a/test/rspamd_shingles_test.c +++ b/test/rspamd_shingles_test.c @@ -81,7 +81,8 @@ free_fuzzy_words (GArray *ar) } static void -test_case (gsize cnt, gsize max_len, gdouble perm_factor) +test_case (gsize cnt, gsize max_len, gdouble perm_factor, + enum rspamd_shingle_alg alg) { GArray *input; struct rspamd_shingle *sgl, *sgl_permuted; @@ -93,17 +94,18 @@ test_case (gsize cnt, gsize max_len, gdouble perm_factor) input = generate_fuzzy_words (cnt, max_len); ts1 = rspamd_get_ticks (); sgl = rspamd_shingles_generate (input, key, NULL, - rspamd_shingles_default_filter, NULL); + rspamd_shingles_default_filter, NULL, alg); ts2 = rspamd_get_ticks (); permute_vector (input, perm_factor); sgl_permuted = rspamd_shingles_generate (input, key, NULL, - rspamd_shingles_default_filter, NULL); + rspamd_shingles_default_filter, NULL, alg); res = rspamd_shingles_compare (sgl, sgl_permuted); - msg_debug ("percentage of common shingles: %.3f, generate time: %hd usec", - res, (gint)(ts1 - ts2) * 1000); - g_assert_cmpfloat (fabs ((1.0 - res) - sqrt (perm_factor)), <=, 0.20); + msg_info ("%d (%z words of %z max len, %.2f perm factor):" + " percentage of common shingles: %.3f, generate time: %.4f sec", + alg, cnt, max_len, perm_factor, res, ts2 - ts1); + g_assert_cmpfloat (fabs ((1.0 - res) - sqrt (perm_factor)), <=, 0.25); free_fuzzy_words (input); g_free (sgl); @@ -113,10 +115,15 @@ test_case (gsize cnt, gsize max_len, gdouble perm_factor) void rspamd_shingles_test_func (void) { - //test_case (5, 100, 0.5); - test_case (200, 10, 0.1); - test_case (500, 20, 0.01); - test_case (5000, 20, 0.01); - test_case (5000, 15, 0); - test_case (5000, 30, 1.0); + enum rspamd_shingle_alg alg = RSPAMD_SHINGLES_OLD; + + for (alg = RSPAMD_SHINGLES_OLD; alg <= RSPAMD_SHINGLES_MUMHASH; alg ++) { + test_case (200, 10, 0.1, alg); + test_case (500, 20, 0.01, alg); + test_case (5000, 20, 0.01, alg); + test_case (5000, 15, 0, alg); + test_case (5000, 30, 1.0, alg); + test_case (50000, 30, 0.02, alg); + test_case (50000, 5, 0.02, alg); + } } -- 2.39.5