summaryrefslogtreecommitdiffstats
path: root/src/libutil/shingles.c
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2016-12-09 12:50:35 +0000
committerVsevolod Stakhov <vsevolod@highsecure.ru>2016-12-09 12:51:08 +0000
commit4fd2f26b9f0985a64122187313cec5ae4ed35c78 (patch)
tree673f62e5c183c196b26f0008ddb31fe98cc6d916 /src/libutil/shingles.c
parent7ddd7a0d48c32f227a73f8d00c4e3ff34ab6feaf (diff)
downloadrspamd-4fd2f26b9f0985a64122187313cec5ae4ed35c78.tar.gz
rspamd-4fd2f26b9f0985a64122187313cec5ae4ed35c78.zip
[Feature] Implement min-hash shingles for DCT data from images
Diffstat (limited to 'src/libutil/shingles.c')
-rw-r--r--src/libutil/shingles.c115
1 files changed, 114 insertions, 1 deletions
diff --git a/src/libutil/shingles.c b/src/libutil/shingles.c
index 0f35a07e0..9d1068c31 100644
--- a/src/libutil/shingles.c
+++ b/src/libutil/shingles.c
@@ -20,7 +20,7 @@
#define SHINGLES_WINDOW 3
struct rspamd_shingle* RSPAMD_OPTIMIZE("unroll-loops")
-rspamd_shingles_generate (GArray *input,
+rspamd_shingles_from_text (GArray *input,
const guchar key[16],
rspamd_mempool_t *pool,
rspamd_shingles_filter filter,
@@ -160,6 +160,119 @@ rspamd_shingles_generate (GArray *input,
return res;
}
+struct rspamd_shingle* RSPAMD_OPTIMIZE("unroll-loops")
+rspamd_shingles_from_image (gdouble *dct,
+ const guchar key[16],
+ rspamd_mempool_t *pool,
+ rspamd_shingles_filter filter,
+ gpointer filterd,
+ enum rspamd_shingle_alg alg)
+{
+ struct rspamd_shingle *shingle;
+ guint64 **hashes;
+ rspamd_sipkey_t keys[RSPAMD_SHINGLE_SIZE];
+ guchar shabuf[rspamd_cryptobox_HASHBYTES], *out_key;
+ const guchar *cur_key;
+ gdouble d;
+ rspamd_cryptobox_hash_state_t bs;
+ guint64 val;
+ gint i, j, k;
+ gsize hlen, beg = 0;
+ enum rspamd_cryptobox_fast_hash_type ht;
+ guint64 res[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE], seed;
+
+ if (pool != NULL) {
+ shingle = rspamd_mempool_alloc (pool, sizeof (*shingle));
+ }
+ else {
+ shingle = g_malloc (sizeof (*shingle));
+ }
+
+ rspamd_cryptobox_hash_init (&bs, NULL, 0);
+ cur_key = key;
+ out_key = (guchar *)&keys[0];
+
+ /* Init hashes pipes and keys */
+ hashes = g_slice_alloc (sizeof (*hashes) * RSPAMD_SHINGLE_SIZE);
+ hlen = 64 - SHINGLES_WINDOW + 1;
+
+ for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
+ hashes[i] = g_slice_alloc (hlen * sizeof (guint64));
+ /*
+ * To generate a set of hashes we just apply sha256 to the
+ * initial key as many times as many hashes are required and
+ * xor left and right parts of sha256 to get a single 16 bytes SIP key.
+ */
+ rspamd_cryptobox_hash_update (&bs, cur_key, 16);
+ rspamd_cryptobox_hash_final (&bs, shabuf);
+
+ for (j = 0; j < 16; j ++) {
+ out_key[j] = shabuf[j];
+ }
+
+ rspamd_cryptobox_hash_init (&bs, NULL, 0);
+ cur_key = out_key;
+ out_key += 16;
+ }
+
+ switch (alg) {
+ case RSPAMD_SHINGLES_OLD:
+ ht = RSPAMD_CRYPTOBOX_MUMHASH;
+ break;
+ case RSPAMD_SHINGLES_XXHASH:
+ ht = RSPAMD_CRYPTOBOX_XXHASH64;
+ break;
+ case RSPAMD_SHINGLES_MUMHASH:
+ ht = RSPAMD_CRYPTOBOX_MUMHASH;
+ break;
+ default:
+ ht = RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT;
+ break;
+ }
+
+ memset (res, 0, sizeof (res));
+
+ for (i = 0; i <= 64; i ++) {
+ if (i - beg >= SHINGLES_WINDOW || i == 64) {
+ for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
+ /* Shift hashes window to right */
+ for (k = 0; k < SHINGLES_WINDOW - 1; k ++) {
+ res[j * SHINGLES_WINDOW + k] =
+ res[j * SHINGLES_WINDOW + k + 1];
+ }
+
+ d = dct[beg];
+ /* Insert the last element to the pipe */
+ memcpy (&seed, keys[j], sizeof (seed));
+ res[j * SHINGLES_WINDOW + SHINGLES_WINDOW - 1] =
+ rspamd_cryptobox_fast_hash_specific (ht,
+ &d, sizeof (d),
+ seed);
+ val = 0;
+ for (k = 0; k < SHINGLES_WINDOW; k ++) {
+ val ^= res[j * SHINGLES_WINDOW + k] >>
+ (8 * (SHINGLES_WINDOW - k - 1));
+ }
+
+ g_assert (hlen > beg);
+ hashes[j][beg] = val;
+ }
+ beg++;
+ }
+ }
+
+ /* Now we need to filter all hashes and make a shingles result */
+ for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
+ shingle->hashes[i] = filter (hashes[i], hlen,
+ i, key, filterd);
+ g_slice_free1 (hlen * sizeof (guint64), hashes[i]);
+ }
+
+ g_slice_free1 (sizeof (*hashes) * RSPAMD_SHINGLE_SIZE, hashes);
+
+ return shingle;
+}
+
guint64
rspamd_shingles_default_filter (guint64 *input, gsize count,