source.dussan.org Git - rspamd.git/commitdiff
[Feature] Add more algorithms for shingles generation
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 11 May 2016 14:54:47 +0000 (15:54 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 11 May 2016 14:54:47 +0000 (15:54 +0100)
src/libutil/shingles.c
src/libutil/shingles.h
src/plugins/fuzzy_check.c
test/rspamd_shingles_test.c

index c2acae6d310c88ff81724499b4394d45f2651718..04a9975bc0a827abe33c3da76170a32ebda6ea8e 100644 (file)
@@ -24,7 +24,8 @@ rspamd_shingles_generate (GArray *input,
                const guchar key[16],
                rspamd_mempool_t *pool,
                rspamd_shingles_filter filter,
-               gpointer filterd)
+               gpointer filterd,
+               enum rspamd_shingle_alg alg)
 {
        struct rspamd_shingle *res;
        GArray *hashes[RSPAMD_SHINGLE_SIZE];
@@ -35,7 +36,8 @@ rspamd_shingles_generate (GArray *input,
        rspamd_ftok_t *word;
        rspamd_cryptobox_hash_state_t bs;
        guint64 val;
-       gint i, j, beg = 0;
+       gint i, j, k, beg = 0;
+       enum rspamd_cryptobox_fast_hash_type ht;
 
        if (pool != NULL) {
                res = rspamd_mempool_alloc (pool, sizeof (*res));
@@ -71,22 +73,64 @@ rspamd_shingles_generate (GArray *input,
        }
 
        /* Now parse input words into a vector of hashes using rolling window */
-       for (i = 0; i <= (gint)input->len; i ++) {
-               if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) {
-                       for (j = beg; j < i; j ++) {
-                               word = &g_array_index (input, rspamd_ftok_t, j);
-                               row = rspamd_fstring_append (row, word->begin, word->len);
+       if (alg == RSPAMD_SHINGLES_OLD) {
+               for (i = 0; i <= (gint)input->len; i ++) {
+                       if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) {
+                               for (j = beg; j < i; j ++) {
+                                       word = &g_array_index (input, rspamd_ftok_t, j);
+                                       row = rspamd_fstring_append (row, word->begin, word->len);
+                               }
+                               beg++;
+
+                               /* Now we need to create a new row here */
+                               for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
+                                       rspamd_cryptobox_siphash ((guchar *)&val, row->str, row->len,
+                                                       keys[j]);
+                                       g_array_append_val (hashes[j], val);
+                               }
+
+                               row = rspamd_fstring_assign (row, "", 0);
                        }
-                       beg++;
+               }
+       }
+       else {
+               guint64 res[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE];
 
-                       /* Now we need to create a new row here */
-                       for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
-                               rspamd_cryptobox_siphash ((guchar *)&val, row->str, row->len,
-                                               keys[j]);
-                               g_array_append_val (hashes[j], val);
-                       }
+               if (alg == RSPAMD_SHINGLES_XXHASH) {
+                       ht = RSPAMD_CRYPTOBOX_XXHASH64;
+               }
+               else {
+                       ht = RSPAMD_CRYPTOBOX_MUMHASH;
+               }
 
-                       row = rspamd_fstring_assign (row, "", 0);
+               memset (res, 0, sizeof (res));
+
+               for (i = 0; i <= (gint)input->len; i ++) {
+                       if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) {
+
+                               for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
+                                       /* Shift hashes window to right */
+                                       for (k = 0; k < SHINGLES_WINDOW - 1; k ++) {
+                                               res[j * SHINGLES_WINDOW + k] =
+                                                               res[j * SHINGLES_WINDOW + k + 1];
+                                       }
+
+                                       word = &g_array_index (input, rspamd_ftok_t, beg);
+                                       /* Insert the last element to the pipe */
+                                       res[j * SHINGLES_WINDOW + SHINGLES_WINDOW - 1] =
+                                                       rspamd_cryptobox_fast_hash_specific (ht,
+                                                                       word->begin, word->len,
+                                                                       *(guint64 *)keys[j]);
+
+                                       val = 0;
+                                       for (k = 0; k < SHINGLES_WINDOW; k ++) {
+                                               val ^= res[j * SHINGLES_WINDOW + k];
+                                       }
+
+                                       g_array_append_val (hashes[j], val);
+                               }
+                               beg++;
+                       }
                }
        }
 
index d252a78f62b1f9ec34035f711bfa936371fc5147..a46a2fc0f26a0d6b9b1eaed8c7a0cfb2427014f2 100644 (file)
@@ -25,6 +25,12 @@ struct rspamd_shingle {
        guint64 hashes[RSPAMD_SHINGLE_SIZE];
 };
 
+enum rspamd_shingle_alg {
+       RSPAMD_SHINGLES_OLD = 0,
+       RSPAMD_SHINGLES_XXHASH,
+       RSPAMD_SHINGLES_MUMHASH,
+};
+
 /**
  * Shingles filtering function
  * @param input input array of hashes
@@ -48,7 +54,8 @@ struct rspamd_shingle* rspamd_shingles_generate (GArray *input,
                const guchar key[16],
                rspamd_mempool_t *pool,
                rspamd_shingles_filter filter,
-               gpointer filterd);
+               gpointer filterd,
+               enum rspamd_shingle_alg alg);
 
 /**
  * Compares two shingles and return result as a floating point value - 1.0
index a5b62875e7e29403911864db6a654c080c7bf721..304ce37041f0d5eb4c9a846296f37f8c90564894 100644 (file)
@@ -1068,7 +1068,8 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule,
                        rule->shingles_key->str);
        sh = rspamd_shingles_generate (words,
                        rule->shingles_key->str, pool,
-                       rspamd_shingles_default_filter, NULL);
+                       rspamd_shingles_default_filter, NULL,
+                       RSPAMD_SHINGLES_OLD);
        if (sh != NULL) {
                memcpy (&shcmd->sgl, sh, sizeof (shcmd->sgl));
                shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE;
index 3eed5eee5896b79fd91a8e7014da462ff88a1de8..cc3b34da729a03ab7d73ca2ac5446c489846196d 100644 (file)
@@ -81,7 +81,8 @@ free_fuzzy_words (GArray *ar)
 }
 
 static void
-test_case (gsize cnt, gsize max_len, gdouble perm_factor)
+test_case (gsize cnt, gsize max_len, gdouble perm_factor,
+               enum rspamd_shingle_alg alg)
 {
        GArray *input;
        struct rspamd_shingle *sgl, *sgl_permuted;
@@ -93,17 +94,18 @@ test_case (gsize cnt, gsize max_len, gdouble perm_factor)
        input = generate_fuzzy_words (cnt, max_len);
        ts1 = rspamd_get_ticks ();
        sgl = rspamd_shingles_generate (input, key, NULL,
-                       rspamd_shingles_default_filter, NULL);
+                       rspamd_shingles_default_filter, NULL, alg);
        ts2 = rspamd_get_ticks ();
        permute_vector (input, perm_factor);
        sgl_permuted = rspamd_shingles_generate (input, key, NULL,
-                       rspamd_shingles_default_filter, NULL);
+                       rspamd_shingles_default_filter, NULL, alg);
 
        res = rspamd_shingles_compare (sgl, sgl_permuted);
 
-       msg_debug ("percentage of common shingles: %.3f, generate time: %hd usec",
-                       res, (gint)(ts1 - ts2) * 1000);
-       g_assert_cmpfloat (fabs ((1.0 - res) - sqrt (perm_factor)), <=, 0.20);
+       msg_info ("%d (%z words of %z max len, %.2f perm factor):"
+                       " percentage of common shingles: %.3f, generate time: %.4f sec",
+                       alg, cnt, max_len, perm_factor, res, ts2 - ts1);
+       g_assert_cmpfloat (fabs ((1.0 - res) - sqrt (perm_factor)), <=, 0.25);
 
        free_fuzzy_words (input);
        g_free (sgl);
@@ -113,10 +115,15 @@ test_case (gsize cnt, gsize max_len, gdouble perm_factor)
 void
 rspamd_shingles_test_func (void)
 {
-       //test_case (5, 100, 0.5);
-       test_case (200, 10, 0.1);
-       test_case (500, 20, 0.01);
-       test_case (5000, 20, 0.01);
-       test_case (5000, 15, 0);
-       test_case (5000, 30, 1.0);
+       enum rspamd_shingle_alg alg = RSPAMD_SHINGLES_OLD;
+
+       for (alg = RSPAMD_SHINGLES_OLD; alg <= RSPAMD_SHINGLES_MUMHASH; alg ++) {
+               test_case (200, 10, 0.1, alg);
+               test_case (500, 20, 0.01, alg);
+               test_case (5000, 20, 0.01, alg);
+               test_case (5000, 15, 0, alg);
+               test_case (5000, 30, 1.0, alg);
+               test_case (50000, 30, 0.02, alg);
+               test_case (50000, 5, 0.02, alg);
+       }
 }