Browse Source

[Feature] Implement min-hash shingles for DCT data from images

tags/1.5.0
Vsevolod Stakhov 7 years ago
parent
commit
4fd2f26b9f
2 changed files with 131 additions and 2 deletions
  1. 114
    1
      src/libutil/shingles.c
  2. 17
    1
      src/libutil/shingles.h

+ 114
- 1
src/libutil/shingles.c View File

@@ -20,7 +20,7 @@
#define SHINGLES_WINDOW 3

struct rspamd_shingle* RSPAMD_OPTIMIZE("unroll-loops")
rspamd_shingles_generate (GArray *input,
rspamd_shingles_from_text (GArray *input,
const guchar key[16],
rspamd_mempool_t *pool,
rspamd_shingles_filter filter,
@@ -160,6 +160,119 @@ rspamd_shingles_generate (GArray *input,
return res;
}

/*
 * Build a min-hash shingle set from an array of DCT coefficients.
 *
 * For each of the RSPAMD_SHINGLE_SIZE derived keys, every input value is
 * hashed with that key, pushed through a SHINGLES_WINDOW-wide sliding
 * window of hashes, and the mixed window value is stored; `filter` then
 * reduces each per-key series to the single value kept in the result.
 *
 * dct:     input coefficients; exactly the first 62
 *          (64 - SHINGLES_WINDOW + 1) values are read.
 *          NOTE(review): the header describes a 64x64 matrix - confirm that
 *          ignoring the remaining coefficients is intended.
 * key:     16 byte secret from which the per-shingle keys are derived
 * pool:    if not NULL the result is allocated from this pool, otherwise
 *          it is allocated with g_malloc
 * filter:  callback invoked as filter (hashes, count, index, key, filterd)
 *          once per shingle; must not be NULL (it is called unconditionally)
 * filterd: opaque pointer passed through to `filter`
 * alg:     selects the fast hash type used for the individual values
 *
 * Returns the filled shingle structure.
 */
struct rspamd_shingle* RSPAMD_OPTIMIZE("unroll-loops")
rspamd_shingles_from_image (gdouble *dct,
		const guchar key[16],
		rspamd_mempool_t *pool,
		rspamd_shingles_filter filter,
		gpointer filterd,
		enum rspamd_shingle_alg alg)
{
	/* Number of input positions the hash window is slid over */
	const gsize dct_len = 64;
	struct rspamd_shingle *shingle;
	guint64 **hashes;
	rspamd_sipkey_t keys[RSPAMD_SHINGLE_SIZE];
	guchar shabuf[rspamd_cryptobox_HASHBYTES], *out_key;
	const guchar *cur_key;
	gdouble d;
	rspamd_cryptobox_hash_state_t bs;
	guint64 val, *window;
	gint i, j, k;
	gsize hlen, beg;
	enum rspamd_cryptobox_fast_hash_type ht;
	guint64 res[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE], seed;

	if (pool != NULL) {
		shingle = rspamd_mempool_alloc (pool, sizeof (*shingle));
	}
	else {
		shingle = g_malloc (sizeof (*shingle));
	}

	rspamd_cryptobox_hash_init (&bs, NULL, 0);
	cur_key = key;
	out_key = (guchar *)&keys[0];

	/* Init hashes pipes and keys */
	hashes = g_slice_alloc (sizeof (*hashes) * RSPAMD_SHINGLE_SIZE);
	hlen = dct_len - SHINGLES_WINDOW + 1;

	for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
		hashes[i] = g_slice_alloc (hlen * sizeof (guint64));
		/*
		 * Keys form a chain: each one is the first 16 bytes of the
		 * cryptobox hash of the previous key, starting from the key
		 * supplied by the caller.
		 */
		rspamd_cryptobox_hash_update (&bs, cur_key, 16);
		rspamd_cryptobox_hash_final (&bs, shabuf);
		memcpy (out_key, shabuf, 16);

		/* Fresh hash state for the next link of the chain */
		rspamd_cryptobox_hash_init (&bs, NULL, 0);
		cur_key = out_key;
		out_key += 16;
	}

	switch (alg) {
	case RSPAMD_SHINGLES_OLD:
		ht = RSPAMD_CRYPTOBOX_MUMHASH;
		break;
	case RSPAMD_SHINGLES_XXHASH:
		ht = RSPAMD_CRYPTOBOX_XXHASH64;
		break;
	case RSPAMD_SHINGLES_MUMHASH:
		ht = RSPAMD_CRYPTOBOX_MUMHASH;
		break;
	default:
		ht = RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT;
		break;
	}

	/*
	 * Every per-key window starts zero filled, so the first
	 * SHINGLES_WINDOW - 1 outputs are mixed with zero placeholders.
	 */
	memset (res, 0, sizeof (res));

	for (beg = 0; beg < hlen; beg ++) {
		/* The hashed value is the same for all keys at this position */
		d = dct[beg];

		for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
			window = &res[j * SHINGLES_WINDOW];

			/* Drop the oldest hash from this key's window */
			for (k = 0; k < SHINGLES_WINDOW - 1; k ++) {
				window[k] = window[k + 1];
			}

			/* The first 8 bytes of the derived key seed the fast hash */
			memcpy (&seed, keys[j], sizeof (seed));
			window[SHINGLES_WINDOW - 1] =
					rspamd_cryptobox_fast_hash_specific (ht,
							&d, sizeof (d),
							seed);

			/* Mix the window: older entries are shifted right further */
			val = 0;
			for (k = 0; k < SHINGLES_WINDOW; k ++) {
				val ^= window[k] >> (8 * (SHINGLES_WINDOW - k - 1));
			}

			hashes[j][beg] = val;
		}
	}

	/* Now we need to filter all hashes and make a shingles result */
	for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
		shingle->hashes[i] = filter (hashes[i], hlen,
				i, key, filterd);
		g_slice_free1 (hlen * sizeof (guint64), hashes[i]);
	}

	g_slice_free1 (sizeof (*hashes) * RSPAMD_SHINGLE_SIZE, hashes);

	return shingle;
}


guint64
rspamd_shingles_default_filter (guint64 *input, gsize count,

+ 17
- 1
src/libutil/shingles.h View File

@@ -51,7 +51,23 @@ typedef guint64 (*rspamd_shingles_filter) (guint64 *input, gsize count,
* @param filterd opaque data for filtering function
* @return shingles array
*/
struct rspamd_shingle* rspamd_shingles_generate (GArray *input,
struct rspamd_shingle* rspamd_shingles_from_text (GArray *input,
const guchar key[16],
rspamd_mempool_t *pool,
rspamd_shingles_filter filter,
gpointer filterd,
enum rspamd_shingle_alg alg);

/**
* Generate shingles from the DCT matrix of an image
* @param dct discrete cosine transform matrix (must be 64x64)
* @param key secret key used to generate shingles
* @param pool pool to allocate shingles array
* @param filter hashes filtering function
* @param filterd opaque data for filtering function
* @param alg algorithm selecting the hash function used for the values
* @return shingles array
*/
struct rspamd_shingle* rspamd_shingles_from_image (gdouble *dct,
const guchar key[16],
rspamd_mempool_t *pool,
rspamd_shingles_filter filter,

Loading…
Cancel
Save