]> source.dussan.org Git - rspamd.git/commitdiff
[Feature] Further micro-optimizations for hashing and shingles
authorVsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 12 May 2016 08:48:29 +0000 (09:48 +0100)
committerVsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 12 May 2016 08:48:29 +0000 (09:48 +0100)
config.h.in
contrib/mumhash/mum.h
src/libcryptobox/cryptobox.c
src/libcryptobox/cryptobox.h
src/libutil/shingles.c
src/libutil/shingles.h
test/rspamd_shingles_test.c

index d858f65c6590bf2b33e504f2a155849b826ca253..c837500f67ddbcb3c8c767607e0f0a78fe0e7fe9 100644 (file)
@@ -323,8 +323,10 @@ typedef off_t goffset;
 # define RSPAMD_ALIGNED(x) __declspec(align(x))
 #elif defined(__GNUC__)
 # define RSPAMD_ALIGNED(x) __attribute__((aligned(x)))
+# define RSPAMD_OPTIMIZE(x) __attribute__((__optimize__ (x)))
 #else
 # define RSPAMD_ALIGNED(x)
+# define RSPAMD_OPTIMIZE(x)
 #endif
 #endif
 
index a9a661661cffcbc8dd0f7caa3d0b2f855e9f2648..1daebf3de94797e336ae361641e4e59d1e992815 100644 (file)
@@ -43,6 +43,7 @@
 #ifndef __MUM_HASH__
 #define __MUM_HASH__
 
+#include "config.h"
 #include <stddef.h>
 #include <stdlib.h>
 #include <string.h>
@@ -56,8 +57,8 @@ typedef unsigned __int64 uint64_t;
 
 #ifdef __GNUC__
 #define _MUM_ATTRIBUTE_UNUSED  __attribute__((unused))
-#define _MUM_OPTIMIZE(opts) __attribute__((__optimize__ (opts)))
-#define _MUM_TARGET(opts) __attribute__((__target__ (opts)))
+#define _MUM_OPTIMIZE(opts) RSPAMD_OPTIMIZE(opts)
+#define _MUM_TARGET(opts)
 #else
 #define _MUM_ATTRIBUTE_UNUSED
 #define _MUM_OPTIMIZE(opts)
index d8fc737d78ac81611fb1bf3fb2d5008ba6a08972..162fcd6622d4f6ee5745a43208aaacf983e4dd94 100644 (file)
@@ -1459,20 +1459,17 @@ guint64
 rspamd_cryptobox_fast_hash (const void *data,
                gsize len, guint64 seed)
 {
-       if (len % 2 == 0) {
+       if (len > 8 && len % 8 == 0) {
                return rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_MUMHASH,
                                        data, len, seed);
        }
        else {
 #if defined(__LP64__) || defined(_LP64)
-               if (len > 8) {
-                       return rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH64,
-                                       data, len, seed);
-               }
+               return rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH64,
+                               data, len, seed);
 #endif
        }
 
-
        return rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH32,
                        data, len, seed);
 }
@@ -1490,7 +1487,9 @@ rspamd_cryptobox_fast_hash_specific (
        case RSPAMD_CRYPTOBOX_XXHASH64:
                return XXH64 (data, len, seed);
        case RSPAMD_CRYPTOBOX_MUMHASH:
-       default:
                return mum_hash (data, len, seed);
+       case RSPAMD_CRYPTOBOX_HASHFAST:
+       default:
+               return rspamd_cryptobox_fast_hash (data, len, seed);
        }
 }
index 18e66df609d62c12b83f5f55d642c4f77ec51347..c44044f4804d59c12a8a7d206ba4030810197b10 100644 (file)
@@ -357,7 +357,8 @@ guint64 rspamd_cryptobox_fast_hash (const void *data,
 enum rspamd_cryptobox_fast_hash_type {
        RSPAMD_CRYPTOBOX_XXHASH64 = 0,
        RSPAMD_CRYPTOBOX_XXHASH32,
-       RSPAMD_CRYPTOBOX_MUMHASH
+       RSPAMD_CRYPTOBOX_MUMHASH,
+       RSPAMD_CRYPTOBOX_HASHFAST
 };
 /**
  * Platform independent version
index 3e238fa5c75a7b4e42fd1036efef4de3e8d489af..8d1b147db93f169ecc67714c1aef31ce8ec9dbac 100644 (file)
@@ -19,7 +19,7 @@
 
 #define SHINGLES_WINDOW 3
 
-struct rspamd_shingle*
+struct rspamd_shingle* RSPAMD_OPTIMIZE("unroll-loops")
 rspamd_shingles_generate (GArray *input,
                const guchar key[16],
                rspamd_mempool_t *pool,
@@ -28,7 +28,7 @@ rspamd_shingles_generate (GArray *input,
                enum rspamd_shingle_alg alg)
 {
        struct rspamd_shingle *res;
-       GArray *hashes[RSPAMD_SHINGLE_SIZE];
+       guint64 **hashes;
        rspamd_sipkey_t keys[RSPAMD_SHINGLE_SIZE];
        guchar shabuf[rspamd_cryptobox_HASHBYTES], *out_key;
        const guchar *cur_key;
@@ -36,7 +36,8 @@ rspamd_shingles_generate (GArray *input,
        rspamd_ftok_t *word;
        rspamd_cryptobox_hash_state_t bs;
        guint64 val;
-       gint i, j, k, beg = 0;
+       gint i, j, k;
+       gsize hlen, beg = 0;
        enum rspamd_cryptobox_fast_hash_type ht;
 
        if (pool != NULL) {
@@ -52,9 +53,11 @@ rspamd_shingles_generate (GArray *input,
        out_key = (guchar *)&keys[0];
 
        /* Init hashes pipes and keys */
+       hashes = g_slice_alloc (sizeof (*hashes) * RSPAMD_SHINGLE_SIZE);
+       hlen = input->len > SHINGLES_WINDOW ? (input->len - SHINGLES_WINDOW + 1) : 1;
+
        for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
-               hashes[i] = g_array_sized_new (FALSE, FALSE, sizeof (guint64),
-                               input->len + SHINGLES_WINDOW);
+               hashes[i] = g_slice_alloc (hlen * sizeof (guint64));
                /*
                 * To generate a set of hashes we just apply sha256 to the
                 * initial key as many times as many hashes are required and
@@ -80,29 +83,34 @@ rspamd_shingles_generate (GArray *input,
                                        word = &g_array_index (input, rspamd_ftok_t, j);
                                        row = rspamd_fstring_append (row, word->begin, word->len);
                                }
-                               beg++;
 
                                /* Now we need to create a new row here */
                                for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
                                        rspamd_cryptobox_siphash ((guchar *)&val, row->str, row->len,
                                                        keys[j]);
-                                       g_array_append_val (hashes[j], val);
+                                       g_assert (hlen > beg);
+                                       hashes[j][beg] = val;
                                }
 
+                               beg++;
+
                                row = rspamd_fstring_assign (row, "", 0);
                        }
                }
        }
        else {
                guint64 res[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE];
-               guint64 RSPAMD_ALIGNED(32) tmpbuf[16];
-               guint rlen;
 
-               if (alg == RSPAMD_SHINGLES_XXHASH) {
+               switch (alg) {
+               case RSPAMD_SHINGLES_XXHASH:
                        ht = RSPAMD_CRYPTOBOX_XXHASH64;
-               }
-               else {
+                       break;
+               case RSPAMD_SHINGLES_MUMHASH:
                        ht = RSPAMD_CRYPTOBOX_MUMHASH;
+                       break;
+               default:
+                       ht = RSPAMD_CRYPTOBOX_HASHFAST;
+                       break;
                }
 
                memset (res, 0, sizeof (res));
@@ -119,26 +127,17 @@ rspamd_shingles_generate (GArray *input,
 
                                        word = &g_array_index (input, rspamd_ftok_t, beg);
                                        /* Insert the last element to the pipe */
-                                       if (word->len >= sizeof (tmpbuf)) {
-                                               rlen = sizeof (tmpbuf);
-                                               memcpy (tmpbuf, word->begin, rlen);
-                                       }
-                                       else {
-                                               rlen = word->len / sizeof (guint64) + 1;
-                                               memset (tmpbuf, 0, rlen * sizeof (guint64));
-                                               memcpy (tmpbuf, word->begin, word->len);
-                                       }
-
                                        res[j * SHINGLES_WINDOW + SHINGLES_WINDOW - 1] =
                                                        rspamd_cryptobox_fast_hash_specific (ht,
-                                                                       tmpbuf,rlen * sizeof (guint64),
+                                                                       word->begin, word->len,
                                                                        *(guint64 *)keys[j]);
                                        val = 0;
                                        for (k = 0; k < SHINGLES_WINDOW; k ++) {
                                                val ^= res[j * SHINGLES_WINDOW + k];
                                        }
 
-                                       g_array_append_val (hashes[j], val);
+                                       g_assert (hlen > beg);
+                                       hashes[j][beg] = val;
                                }
                                beg++;
                        }
@@ -147,11 +146,13 @@ rspamd_shingles_generate (GArray *input,
 
        /* Now we need to filter all hashes and make a shingles result */
        for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
-               res->hashes[i] = filter ((guint64 *)hashes[i]->data, hashes[i]->len,
+               res->hashes[i] = filter (hashes[i], hlen,
                                i, key, filterd);
-               g_array_free (hashes[i], TRUE);
+               g_slice_free1 (hlen * sizeof (guint64), hashes[i]);
        }
 
+       g_slice_free1 (sizeof (*hashes) * RSPAMD_SHINGLE_SIZE, hashes);
+
        rspamd_fstring_free (row);
 
        return res;
index a46a2fc0f26a0d6b9b1eaed8c7a0cfb2427014f2..fd7d3bfc70e52b358ac5c916f8f0d0869b597e6e 100644 (file)
@@ -29,6 +29,7 @@ enum rspamd_shingle_alg {
        RSPAMD_SHINGLES_OLD = 0,
        RSPAMD_SHINGLES_XXHASH,
        RSPAMD_SHINGLES_MUMHASH,
+       RSPAMD_SHINGLES_FAST
 };
 
 /**
index a4289497ebe91c6d0751f6973e44584942d89489..5669137d4fc4e4f1c8a2592520c68beb180f720b 100644 (file)
@@ -93,10 +93,10 @@ test_case (gsize cnt, gsize max_len, gdouble perm_factor,
 
        ottery_rand_bytes (key, sizeof (key));
        input = generate_fuzzy_words (cnt, max_len);
-       ts1 = rspamd_get_ticks ();
+       ts1 = rspamd_get_virtual_ticks ();
        sgl = rspamd_shingles_generate (input, key, NULL,
                        rspamd_shingles_default_filter, NULL, alg);
-       ts2 = rspamd_get_ticks ();
+       ts2 = rspamd_get_virtual_ticks ();
        permute_vector (input, perm_factor);
        sgl_permuted = rspamd_shingles_generate (input, key, NULL,
                        rspamd_shingles_default_filter, NULL, alg);
@@ -118,7 +118,7 @@ rspamd_shingles_test_func (void)
 {
        enum rspamd_shingle_alg alg = RSPAMD_SHINGLES_OLD;
 
-       for (alg = RSPAMD_SHINGLES_OLD; alg <= RSPAMD_SHINGLES_MUMHASH; alg ++) {
+       for (alg = RSPAMD_SHINGLES_OLD; alg <= RSPAMD_SHINGLES_FAST; alg ++) {
                test_case (200, 10, 0.1, alg);
                test_case (500, 20, 0.01, alg);
                test_case (5000, 20, 0.01, alg);