]> source.dussan.org Git - rspamd.git/commitdiff
[Feature] Use cached shingles keys
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 3 Apr 2017 12:52:53 +0000 (13:52 +0100)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 3 Apr 2017 12:52:53 +0000 (13:52 +0100)
src/libutil/shingles.c

index 2dd7faffc15d7f9f7485597fef45b8aa1cc5f5aa..ddac12ac443b854533863d20c808ba057738fdd3 100644 (file)
 #include "libstat/stat_api.h"
 
 #define SHINGLES_WINDOW 3
+#define SHINGLES_KEY_SIZE rspamd_cryptobox_SIPKEYBYTES
+
+/*
+ * Hash callback for the keys cache table: hashes exactly
+ * SHINGLES_KEY_SIZE bytes starting at `k`, seeded with rspamd_hash_seed ().
+ */
+static guint
+rspamd_shingles_keys_hash (gconstpointer k)
+{
+       guint h;
+
+       h = rspamd_cryptobox_fast_hash (k, SHINGLES_KEY_SIZE, rspamd_hash_seed ());
+
+       return h;
+}
+
+/*
+ * Equality callback for the keys cache table: two keys are equal when their
+ * first SHINGLES_KEY_SIZE bytes are identical.
+ */
+static gboolean
+rspamd_shingles_keys_equal (gconstpointer k1, gconstpointer k2)
+{
+       if (memcmp (k1, k2, SHINGLES_KEY_SIZE) != 0) {
+               return FALSE;
+       }
+
+       return TRUE;
+}
+
+/*
+ * Releases a key set: frees each of the RSPAMD_SHINGLE_SIZE key buffers
+ * and then the pointer array itself.
+ */
+static void
+rspamd_shingles_keys_free (gpointer p)
+{
+       guchar **keys = p, **cur;
+
+       for (cur = keys; cur < keys + RSPAMD_SHINGLE_SIZE; cur ++) {
+               g_free (*cur);
+       }
+
+       g_free (keys);
+}
+
+/*
+ * Allocates a zero-filled key set: an array of RSPAMD_SHINGLE_SIZE pointers,
+ * each pointing to its own zeroed buffer of SHINGLES_KEY_SIZE bytes.
+ * The result is released with rspamd_shingles_keys_free ().
+ */
+static guchar **
+rspamd_shingles_keys_new (void)
+{
+       guchar **keys = g_malloc0 (RSPAMD_SHINGLE_SIZE * sizeof (*keys));
+       guint n = 0;
+
+       while (n < RSPAMD_SHINGLE_SIZE) {
+               keys[n ++] = g_malloc0 (SHINGLES_KEY_SIZE * sizeof (guchar));
+       }
+
+       return keys;
+}
+
+/*
+ * Returns the RSPAMD_SHINGLE_SIZE derived keys for the master `key`,
+ * computing them on first use and caching them in a function-local static
+ * hash table keyed by a private copy of `key`.
+ *
+ * The returned array is owned by the cache (the table was created with
+ * rspamd_shingles_keys_free as its value destructor), so callers must not
+ * free or modify it. No locking is performed around the static table.
+ */
+static guchar **
+rspamd_shingles_get_keys_cached (const guchar key[SHINGLES_KEY_SIZE])
+{
+       static GHashTable *ht = NULL;
+       guchar **keys = NULL, *key_cpy;
+       rspamd_cryptobox_hash_state_t bs;
+       const guchar *cur_key;
+       guchar shabuf[rspamd_cryptobox_HASHBYTES];
+       guint i;
+
+       if (ht == NULL) {
+               /* First call: create the cache, nothing to look up yet */
+               ht = g_hash_table_new_full (rspamd_shingles_keys_hash,
+                               rspamd_shingles_keys_equal, g_free, rspamd_shingles_keys_free);
+       }
+       else {
+               keys = g_hash_table_lookup (ht, key);
+       }
+
+       if (keys == NULL) {
+               keys = rspamd_shingles_keys_new ();
+               /* The table owns its own copy of the key (freed by g_free) */
+               key_cpy = g_malloc (SHINGLES_KEY_SIZE);
+               memcpy (key_cpy, key, SHINGLES_KEY_SIZE);
+
+               /* Generate keys */
+               cur_key = key;
+
+               for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
+                       /*
+                        * Derive a chain of keys: each key is the leading
+                        * SHINGLES_KEY_SIZE bytes of the hash of the previous key
+                        * (the first one is derived from the master key).
+                        */
+                       rspamd_cryptobox_hash_init (&bs, NULL, 0);
+                       rspamd_cryptobox_hash_update (&bs, cur_key, SHINGLES_KEY_SIZE);
+                       rspamd_cryptobox_hash_final (&bs, shabuf);
+
+                       /*
+                        * Index keys[i] directly: never touch keys[i + 1], which is
+                        * out of bounds on the last iteration.
+                        */
+                       memcpy (keys[i], shabuf, SHINGLES_KEY_SIZE);
+                       cur_key = keys[i];
+               }
+
+               g_hash_table_insert (ht, key_cpy, keys);
+       }
+
+       return keys;
+}
 
 struct rspamd_shingle* RSPAMD_OPTIMIZE("unroll-loops")
 rspamd_shingles_from_text (GArray *input,
@@ -31,12 +122,9 @@ rspamd_shingles_from_text (GArray *input,
 {
        struct rspamd_shingle *res;
        guint64 **hashes;
-       rspamd_sipkey_t keys[RSPAMD_SHINGLE_SIZE];
-       guchar shabuf[rspamd_cryptobox_HASHBYTES], *out_key;
-       const guchar *cur_key;
+       guchar **keys;
        rspamd_fstring_t *row;
        rspamd_stat_token_t *word;
-       rspamd_cryptobox_hash_state_t bs;
        guint64 val;
        gint i, j, k;
        gsize hlen, beg = 0;
@@ -49,29 +137,16 @@ rspamd_shingles_from_text (GArray *input,
                res = g_malloc (sizeof (*res));
        }
 
-       rspamd_cryptobox_hash_init (&bs, NULL, 0);
        row = rspamd_fstring_sized_new (256);
-       cur_key = key;
-       out_key = (guchar *)&keys[0];
 
        /* Init hashes pipes and keys */
        hashes = g_slice_alloc (sizeof (*hashes) * RSPAMD_SHINGLE_SIZE);
-       hlen = input->len > SHINGLES_WINDOW ? (input->len - SHINGLES_WINDOW + 1) : 1;
+       hlen = input->len > SHINGLES_WINDOW ?
+                       (input->len - SHINGLES_WINDOW + 1) : 1;
+       keys = rspamd_shingles_get_keys_cached (key);
 
        for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
                hashes[i] = g_slice_alloc (hlen * sizeof (guint64));
-               /*
-                * To generate a set of hashes we just apply sha256 to the
-                * initial key as many times as many hashes are required and
-                * xor left and right parts of sha256 to get a single 16 bytes SIP key.
-                */
-               rspamd_cryptobox_hash_update (&bs, cur_key, 16);
-               rspamd_cryptobox_hash_final (&bs, shabuf);
-
-               memcpy (out_key, shabuf, 16);
-               rspamd_cryptobox_hash_init (&bs, NULL, 0);
-               cur_key = out_key;
-               out_key += 16;
        }
 
        /* Now parse input words into a vector of hashes using rolling window */