From 4fd2f26b9f0985a64122187313cec5ae4ed35c78 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Fri, 9 Dec 2016 12:50:35 +0000 Subject: [PATCH] [Feature] Implement min-hash shingles for DCT data from images --- src/libutil/shingles.c | 115 ++++++++++++++++++++++++++++++++++++++++- src/libutil/shingles.h | 18 ++++++- 2 files changed, 131 insertions(+), 2 deletions(-) diff --git a/src/libutil/shingles.c b/src/libutil/shingles.c index 0f35a07e0..9d1068c31 100644 --- a/src/libutil/shingles.c +++ b/src/libutil/shingles.c @@ -20,7 +20,7 @@ #define SHINGLES_WINDOW 3 struct rspamd_shingle* RSPAMD_OPTIMIZE("unroll-loops") -rspamd_shingles_generate (GArray *input, +rspamd_shingles_from_text (GArray *input, const guchar key[16], rspamd_mempool_t *pool, rspamd_shingles_filter filter, @@ -160,6 +160,119 @@ rspamd_shingles_generate (GArray *input, return res; } +struct rspamd_shingle* RSPAMD_OPTIMIZE("unroll-loops") +rspamd_shingles_from_image (gdouble *dct, + const guchar key[16], + rspamd_mempool_t *pool, + rspamd_shingles_filter filter, + gpointer filterd, + enum rspamd_shingle_alg alg) +{ + struct rspamd_shingle *shingle; + guint64 **hashes; + rspamd_sipkey_t keys[RSPAMD_SHINGLE_SIZE]; + guchar shabuf[rspamd_cryptobox_HASHBYTES], *out_key; + const guchar *cur_key; + gdouble d; + rspamd_cryptobox_hash_state_t bs; + guint64 val; + gint i, j, k; + gsize hlen, beg = 0; + enum rspamd_cryptobox_fast_hash_type ht; + guint64 res[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE], seed; + + if (pool != NULL) { + shingle = rspamd_mempool_alloc (pool, sizeof (*shingle)); + } + else { + shingle = g_malloc (sizeof (*shingle)); + } + + rspamd_cryptobox_hash_init (&bs, NULL, 0); + cur_key = key; + out_key = (guchar *)&keys[0]; + + /* Init hashes pipes and keys */ + hashes = g_slice_alloc (sizeof (*hashes) * RSPAMD_SHINGLE_SIZE); + hlen = 64 - SHINGLES_WINDOW + 1; + + for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) { + hashes[i] = g_slice_alloc (hlen * sizeof (guint64)); + /* + * To generate a set of hashes we just apply sha256 to the + * initial key as many times as many hashes are required and + * xor left and right parts of sha256 to get a single 16 bytes SIP key. + */ + rspamd_cryptobox_hash_update (&bs, cur_key, 16); + rspamd_cryptobox_hash_final (&bs, shabuf); + + for (j = 0; j < 16; j ++) { + out_key[j] = shabuf[j]; + } + + rspamd_cryptobox_hash_init (&bs, NULL, 0); + cur_key = out_key; + out_key += 16; + } + + switch (alg) { + case RSPAMD_SHINGLES_OLD: + ht = RSPAMD_CRYPTOBOX_MUMHASH; + break; + case RSPAMD_SHINGLES_XXHASH: + ht = RSPAMD_CRYPTOBOX_XXHASH64; + break; + case RSPAMD_SHINGLES_MUMHASH: + ht = RSPAMD_CRYPTOBOX_MUMHASH; + break; + default: + ht = RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT; + break; + } + + memset (res, 0, sizeof (res)); + + for (i = 0; i <= 64; i ++) { + if (i - beg >= SHINGLES_WINDOW || i == 64) { + for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) { + /* Shift hashes window to right */ + for (k = 0; k < SHINGLES_WINDOW - 1; k ++) { + res[j * SHINGLES_WINDOW + k] = + res[j * SHINGLES_WINDOW + k + 1]; + } + + d = dct[beg]; + /* Insert the last element to the pipe */ + memcpy (&seed, keys[j], sizeof (seed)); + res[j * SHINGLES_WINDOW + SHINGLES_WINDOW - 1] = + rspamd_cryptobox_fast_hash_specific (ht, + &d, sizeof (d), + seed); + val = 0; + for (k = 0; k < SHINGLES_WINDOW; k ++) { + val ^= res[j * SHINGLES_WINDOW + k] >> + (8 * (SHINGLES_WINDOW - k - 1)); + } + + g_assert (hlen > beg); + hashes[j][beg] = val; + } + beg++; + } + } + + /* Now we need to filter all hashes and make a shingles result */ + for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) { + shingle->hashes[i] = filter (hashes[i], hlen, + i, key, filterd); + g_slice_free1 (hlen * sizeof (guint64), hashes[i]); + } + + g_slice_free1 (sizeof (*hashes) * RSPAMD_SHINGLE_SIZE, hashes); + + return shingle; +} + guint64 rspamd_shingles_default_filter (guint64 *input, gsize count, diff --git a/src/libutil/shingles.h b/src/libutil/shingles.h index fd7d3bfc7..5f455cdf8 100644 --- a/src/libutil/shingles.h +++ b/src/libutil/shingles.h @@ -51,7 +51,23 @@ typedef guint64 (*rspamd_shingles_filter) (guint64 *input, gsize count, * @param filterd opaque data for filtering function * @return shingles array */ -struct rspamd_shingle* rspamd_shingles_generate (GArray *input, +struct rspamd_shingle* rspamd_shingles_from_text (GArray *input, + const guchar key[16], + rspamd_mempool_t *pool, + rspamd_shingles_filter filter, + gpointer filterd, + enum rspamd_shingle_alg alg); + +/** + * Generate shingles from the DCT matrix of an image + * @param dct discrete cosine transfor matrix (must be 64x64) + * @param key secret key used to generate shingles + * @param pool pool to allocate shigles array + * @param filter hashes filtering function + * @param filterd opaque data for filtering function + * @return shingles array + */ +struct rspamd_shingle* rspamd_shingles_from_image (gdouble *dct, const guchar key[16], rspamd_mempool_t *pool, rspamd_shingles_filter filter, -- 2.39.5