From: Vsevolod Stakhov
Date: Mon, 3 Apr 2017 12:52:53 +0000 (+0100)
Subject: [Feature] Use cached shingles keys
X-Git-Tag: 1.5.5~61
X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=de922dd854760bcd362adbc3a2dd5a7d203ad10f;p=rspamd.git

[Feature] Use cached shingles keys
---
Review note: the key-derivation loop below selects keys[i] at the top of each
round instead of pre-loading keys[i + 1] at the bottom, which read one element
past the end of the array on the last round. The comment inside the loop was
reworded to describe what the code does. The blob ids on the "index" line
still refer to the original commit.

diff --git a/src/libutil/shingles.c b/src/libutil/shingles.c
index 2dd7faffc..ddac12ac4 100644
--- a/src/libutil/shingles.c
+++ b/src/libutil/shingles.c
@@ -20,6 +20,97 @@
 #include "libstat/stat_api.h"
 
 #define SHINGLES_WINDOW 3
+#define SHINGLES_KEY_SIZE rspamd_cryptobox_SIPKEYBYTES
+
+static guint
+rspamd_shingles_keys_hash (gconstpointer k)
+{
+	return rspamd_cryptobox_fast_hash (k, SHINGLES_KEY_SIZE,
+			rspamd_hash_seed ());
+}
+
+static gboolean
+rspamd_shingles_keys_equal (gconstpointer k1, gconstpointer k2)
+{
+	return (memcmp (k1, k2, SHINGLES_KEY_SIZE) == 0);
+}
+
+static void
+rspamd_shingles_keys_free (gpointer p)
+{
+	guchar **k = p;
+	guint i;
+
+	for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
+		g_free (k[i]);
+	}
+
+	g_free (k);
+}
+
+static guchar **
+rspamd_shingles_keys_new (void)
+{
+	guchar **k;
+	guint i;
+
+	k = g_malloc0 (sizeof (guchar *) * RSPAMD_SHINGLE_SIZE);
+
+	for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
+		k[i] = g_malloc0 (sizeof (guchar) * SHINGLES_KEY_SIZE);
+	}
+
+	return k;
+}
+
+static guchar **
+rspamd_shingles_get_keys_cached (const guchar key[SHINGLES_KEY_SIZE])
+{
+	static GHashTable *ht = NULL;
+	guchar **keys = NULL, *key_cpy;
+	rspamd_cryptobox_hash_state_t bs;
+	const guchar *cur_key;
+	guchar shabuf[rspamd_cryptobox_HASHBYTES], *out_key;
+	guint i;
+
+	if (ht == NULL) {
+		ht = g_hash_table_new_full (rspamd_shingles_keys_hash,
+				rspamd_shingles_keys_equal, g_free, rspamd_shingles_keys_free);
+	}
+	else {
+		keys = g_hash_table_lookup (ht, key);
+	}
+
+	if (keys == NULL) {
+		keys = rspamd_shingles_keys_new ();
+		key_cpy = g_malloc (SHINGLES_KEY_SIZE);
+		memcpy (key_cpy, key, SHINGLES_KEY_SIZE);
+
+		/* Generate keys */
+		rspamd_cryptobox_hash_init (&bs, NULL, 0);
+		cur_key = key;
+
+		for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
+			/*
+			 * Derive the chain of keys: hash the previous key (the
+			 * caller's key on the first round) and keep the first 16
+			 * bytes of the digest as the next SIP key. keys[i] is picked
+			 * here so the loop never indexes past the end of the array.
+			 */
+			out_key = keys[i];
+			rspamd_cryptobox_hash_update (&bs, cur_key, 16);
+			rspamd_cryptobox_hash_final (&bs, shabuf);
+
+			memcpy (out_key, shabuf, 16);
+			rspamd_cryptobox_hash_init (&bs, NULL, 0);
+			cur_key = out_key;
+		}
+
+		g_hash_table_insert (ht, key_cpy, keys);
+	}
+
+	return keys;
+}
 
 struct rspamd_shingle* RSPAMD_OPTIMIZE("unroll-loops")
 rspamd_shingles_from_text (GArray *input,
@@ -31,12 +122,9 @@ rspamd_shingles_from_text (GArray *input,
 {
 	struct rspamd_shingle *res;
 	guint64 **hashes;
-	rspamd_sipkey_t keys[RSPAMD_SHINGLE_SIZE];
-	guchar shabuf[rspamd_cryptobox_HASHBYTES], *out_key;
-	const guchar *cur_key;
+	guchar **keys;
 	rspamd_fstring_t *row;
 	rspamd_stat_token_t *word;
-	rspamd_cryptobox_hash_state_t bs;
 	guint64 val;
 	gint i, j, k;
 	gsize hlen, beg = 0;
@@ -49,29 +137,16 @@ rspamd_shingles_from_text (GArray *input,
 		res = g_malloc (sizeof (*res));
 	}
 
-	rspamd_cryptobox_hash_init (&bs, NULL, 0);
 	row = rspamd_fstring_sized_new (256);
-	cur_key = key;
-	out_key = (guchar *)&keys[0];
 
 	/* Init hashes pipes and keys */
 	hashes = g_slice_alloc (sizeof (*hashes) * RSPAMD_SHINGLE_SIZE);
-	hlen = input->len > SHINGLES_WINDOW ? (input->len - SHINGLES_WINDOW + 1) : 1;
+	hlen = input->len > SHINGLES_WINDOW ?
+			(input->len - SHINGLES_WINDOW + 1) : 1;
+	keys = rspamd_shingles_get_keys_cached (key);
 
 	for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
 		hashes[i] = g_slice_alloc (hlen * sizeof (guint64));
-		/*
-		 * To generate a set of hashes we just apply sha256 to the
-		 * initial key as many times as many hashes are required and
-		 * xor left and right parts of sha256 to get a single 16 bytes SIP key.
-		 */
-		rspamd_cryptobox_hash_update (&bs, cur_key, 16);
-		rspamd_cryptobox_hash_final (&bs, shabuf);
-
-		memcpy (out_key, shabuf, 16);
-		rspamd_cryptobox_hash_init (&bs, NULL, 0);
-		cur_key = out_key;
-		out_key += 16;
 	}
 
 	/* Now parse input words into a vector of hashes using rolling window */