source.dussan.org Git - rspamd.git/commitdiff
[Rework] Rework images fuzzy hashes algorithm
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 8 Dec 2016 18:35:12 +0000 (18:35 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 8 Dec 2016 18:35:12 +0000 (18:35 +0000)
src/libmime/images.c
src/libmime/images.h
src/plugins/fuzzy_check.c

index e6fe8ee4b18d192feb2923cfe43a121b7256f864..81bdcce2272d0343bfd7cbcdb8aa2a6a73e4be60 100644 (file)
@@ -214,9 +214,7 @@ rspamd_image_normalize (struct rspamd_task *task, struct rspamd_image *img)
 {
 #ifdef WITH_GD
        gdImagePtr src = NULL, dst = NULL;
-       guint nw, nh, i, j, b = 0;
-       gdouble avg, sum;
-       guchar sig[rspamd_cryptobox_HASHBYTES];
+       guint nw, nh, i, j, b = 0, nmax, nmin;
 
        if (img->data->len == 0 || img->data->len > G_MAXINT32) {
                return;
@@ -258,18 +256,20 @@ rspamd_image_normalize (struct rspamd_task *task, struct rspamd_image *img)
                gdImageGrayScale (dst);
                gdImageDestroy (src);
 
-               img->normalized_data = g_array_sized_new (FALSE, FALSE, sizeof (gint),
-                               nh * nw);
-
-               avg = 0;
+               img->is_normalized = TRUE;
+               nmax = 0;
+               nmin = G_MAXUINT;
 
                /* Calculate moving average */
                for (i = 0; i < nh; i ++) {
                        for (j = 0; j < nw; j ++) {
-                               gint px = gdImageGetPixel (dst, j, i);
-                               avg += (px - avg) / (gdouble)(i * nh + j + 1);
-
-                               g_array_append_val (img->normalized_data, px);
+                               guint px = (guint)gdImageGetPixel (dst, j, i);
+                               if (px > nmax) {
+                                       nmax = px;
+                               }
+                               if (px < nmin) {
+                                       nmin = px;
+                               }
                        }
                }
 
@@ -279,7 +279,7 @@ rspamd_image_normalize (struct rspamd_task *task, struct rspamd_image *img)
                 * ****
                 * ****
                 *
-                * Get sum of saturation values, and set bit if sum is > avg * 4
+                * Get sum of saturation values, and set bit if sum is > avg
                 * Then go further
                 *
                 * ****
@@ -287,58 +287,58 @@ rspamd_image_normalize (struct rspamd_task *task, struct rspamd_image *img)
                 *
                 * and repeat this algorithm.
                 *
-                * So on each iteration we move by 16 pixels and calculate 2 bits of signature
-                * hence, we produce ({64} / {4}) ^ 2 * 2 == 512 bits
+                * So on each iteration we move by 16 pixels and calculate 2 elements of
+                * signature
                 */
                for (i = 0; i < nh; i += 4) {
                        for (j = 0; j < nw; j += 4) {
-                               gint p[8];
-
-                               p[0] = g_array_index (img->normalized_data, gint, i * nh + j);
-                               p[1] = g_array_index (img->normalized_data, gint, i * nh + j + 1);
-                               p[2] = g_array_index (img->normalized_data, gint, i * nh + j + 2);
-                               p[3] = g_array_index (img->normalized_data, gint, i * nh + j + 3);
-                               p[4] = g_array_index (img->normalized_data, gint, (i + 1) * nh + j);
-                               p[5] = g_array_index (img->normalized_data, gint, (i + 1) * nh + j + 1);
-                               p[6] = g_array_index (img->normalized_data, gint, (i + 1) * nh + j + 2);
-                               p[7] = g_array_index (img->normalized_data, gint, (i + 1) * nh + j + 3);
-                               sum = p[0] + p[1] + p[2] + p[3] + p[4] + p[5] + p[6] + p[7];
-
-                               if (fabs (sum) >= fabs (avg * 8)) {
-                                       setbit (sig, b);
-                               }
-                               else {
-                                       clrbit (sig, b);
-                               }
-                               b ++;
-
-                               p[0] = g_array_index (img->normalized_data, gint, (i + 2) * nh + j);
-                               p[1] = g_array_index (img->normalized_data, gint, (i + 2) * nh + j + 1);
-                               p[2] = g_array_index (img->normalized_data, gint, (i + 2) * nh + j + 2);
-                               p[3] = g_array_index (img->normalized_data, gint, (i + 2) * nh + j + 3);
-                               p[4] = g_array_index (img->normalized_data, gint, (i + 3) * nh + j);
-                               p[5] = g_array_index (img->normalized_data, gint, (i + 3) * nh + j + 1);
-                               p[6] = g_array_index (img->normalized_data, gint, (i + 3) * nh + j + 2);
-                               p[7] = g_array_index (img->normalized_data, gint, (i + 3) * nh + j + 3);
-
-                               sum = p[0] + p[1] + p[2] + p[3] + p[4] + p[5] + p[6] + p[7];
-
-                               if (fabs (sum) >= fabs (avg * 8)) {
-                                       setbit (sig, b);
-                               }
-                               else {
-                                       clrbit (sig, b);
-                               }
-                               b ++;
+                               guint p[8];
+                               guint64 n = 0;
+
+                               p[0] = nmax - (guint)gdImageGetPixel (dst, i, j) + nmin;
+                               p[1] = nmax - (guint)gdImageGetPixel (dst, i, j + 1) + nmin;
+                               p[2] = nmax - (guint)gdImageGetPixel (dst, i, j + 2) + nmin;
+                               p[3] = nmax - (guint)gdImageGetPixel (dst, i, j + 3) + nmin;
+                               p[4] = nmax - (guint)gdImageGetPixel (dst, i + 1, j) + nmin;
+                               p[5] = nmax - (guint)gdImageGetPixel (dst, i + 1, j + 1) + nmin;
+                               p[6] = nmax - (guint)gdImageGetPixel (dst, i + 1, j + 2) + nmin;
+                               p[7] = nmax - (guint)gdImageGetPixel (dst, i + 1, j + 3) + nmin;
+
+                               n |= ((guint64)(p[0] / (nmax - nmin) % 256)) << 0;
+                               n |= ((guint64)(p[1] / (nmax - nmin) % 256)) << 8;
+                               n |= ((guint64)(p[2] / (nmax - nmin) % 256)) << 16;
+                               n |= ((guint64)(p[3] / (nmax - nmin) % 256)) << 24;
+                               n |= ((guint64)(p[4] / (nmax - nmin) % 256)) << 32;
+                               n |= ((guint64)(p[5] / (nmax - nmin) % 256)) << 40;
+                               n |= ((guint64)(p[6] / (nmax - nmin) % 256)) << 48;
+                               n |= ((guint64)(p[7] / (nmax - nmin) % 256)) << 56;
+                               img->fuzzy_sig[b++] = n;
+
+                               p[0] = nmax - (guint)gdImageGetPixel (dst, i + 2, j) + nmin;
+                               p[1] = nmax - (guint)gdImageGetPixel (dst, i + 2, j + 1) + nmin;
+                               p[2] = nmax - (guint)gdImageGetPixel (dst, i + 2, j + 2) + nmin;
+                               p[3] = nmax - (guint)gdImageGetPixel (dst, i + 2, j + 3) + nmin;
+                               p[4] = nmax - (guint)gdImageGetPixel (dst, i + 3, j) + nmin;
+                               p[5] = nmax - (guint)gdImageGetPixel (dst, i + 3, j + 1) + nmin;
+                               p[6] = nmax - (guint)gdImageGetPixel (dst, i + 3, j + 2) + nmin;
+                               p[7] = nmax - (guint)gdImageGetPixel (dst, i + 3, j + 3) + nmin;
+
+                               n |= ((guint64)(p[0] / (nmax - nmin) % 256)) << 0;
+                               n |= ((guint64)(p[1] / (nmax - nmin) % 256)) << 8;
+                               n |= ((guint64)(p[2] / (nmax - nmin) % 256)) << 16;
+                               n |= ((guint64)(p[3] / (nmax - nmin) % 256)) << 24;
+                               n |= ((guint64)(p[4] / (nmax - nmin) % 256)) << 32;
+                               n |= ((guint64)(p[5] / (nmax - nmin) % 256)) << 40;
+                               n |= ((guint64)(p[6] / (nmax - nmin) % 256)) << 48;
+                               n |= ((guint64)(p[7] / (nmax - nmin) % 256)) << 56;
+                               img->fuzzy_sig[b++] = n;
                        }
                }
 
-               msg_debug_task ("avg: %.0f, sig: %32xs, bits: %d", avg, sig, b);
-               memcpy (img->fuzzy_sig, sig, sizeof (img->fuzzy_sig));
+               msg_debug_task ("min: %d, max: %d, sig: %32xs, elts: %d", nmin, nmax,
+                               (const char *)img->fuzzy_sig, b);
 
                gdImageDestroy (dst);
-               rspamd_mempool_add_destructor (task->task_pool, rspamd_array_free_hard,
-                               img->normalized_data);
        }
 #endif
 }
index 1b46954e0c38219080ec510cfd36e5938525befb..01d0afd2275f68d632a89dc83a4c11c1a1d6832c 100644 (file)
@@ -2,7 +2,6 @@
 #define IMAGES_H_
 
 #include "config.h"
-#include "cryptobox.h"
 
 struct html_image;
 struct rspamd_task;
@@ -19,13 +18,13 @@ enum rspamd_image_type {
 struct rspamd_image {
        struct rspamd_mime_part *parent;
        GByteArray *data;
-       GArray *normalized_data;
-       guchar fuzzy_sig[rspamd_cryptobox_HASHBYTES];
        const gchar *filename;
        struct html_image *html_image;
        enum rspamd_image_type type;
        guint32 width;
        guint32 height;
+       gboolean is_normalized;
+       guint64 fuzzy_sig[32];
 };
 
 /*
index 3213837a13aefe7dd6be0295746e2aaea16cd315..f37ef45d08c8391c5dc1da775e78104869cfbb60 100644 (file)
@@ -1318,12 +1318,16 @@ fuzzy_cmd_from_image_part (struct fuzzy_rule *rule,
                /*
                 * Generate shingles
                 */
-               for (i = 0; i < sizeof (img->fuzzy_sig); i += 2) {
-                       shingles[i / 2] = rspamd_cryptobox_fast_hash_specific (
-                                       RSPAMD_CRYPTOBOX_MUMHASH, &img->fuzzy_sig[i], 2, 0);
+               G_STATIC_ASSERT (G_N_ELEMENTS (img->fuzzy_sig) == RSPAMD_SHINGLE_SIZE);
+
+               for (i = 0; i < RSPAMD_SHINGLE_SIZE; i ++) {
+                       shingles[i] = rspamd_cryptobox_fast_hash_specific (
+                                       RSPAMD_CRYPTOBOX_MUMHASH,
+                                       (const guchar *)&img->fuzzy_sig[i],
+                                       sizeof (img->fuzzy_sig[i]), 0);
                }
                rspamd_cryptobox_hash (shcmd->basic.digest,
-                               img->fuzzy_sig, sizeof (img->fuzzy_sig),
+                               (const guchar *)img->fuzzy_sig, sizeof (img->fuzzy_sig),
                                rule->hash_key->str, rule->hash_key->len);
 
                msg_debug_pool ("loading shingles of type %s with key %*xs",
@@ -2211,7 +2215,7 @@ fuzzy_generate_commands (struct rspamd_task *task, struct fuzzy_rule *rule,
                                                        g_ptr_array_add (res, io);
                                                }
 
-                                               if (image->normalized_data) {
+                                               if (image->is_normalized) {
                                                        io = fuzzy_cmd_from_image_part (rule, c, flag, value,
                                                                        task->task_pool,
                                                                        image);