]> source.dussan.org Git - rspamd.git/commitdiff
[Feature] Implement new algorithm for fuzzy hashes of images
authorVsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 8 Dec 2016 17:17:08 +0000 (17:17 +0000)
committerVsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 8 Dec 2016 17:47:44 +0000 (17:47 +0000)
src/libmime/images.c
src/libmime/images.h

index a65c580bbb77d182374c97e8d2a7d3b1b3695344..57528fb5465d4578e38786ba3c4072c0092bc289 100644 (file)
@@ -21,8 +21,9 @@
 
 #ifdef WITH_GD
 #include "gd.h"
+#include <math.h>
 
-#define RSPAMD_NORMALIZED_DIM 64
+#define RSPAMD_NORMALIZED_DIM rspamd_cryptobox_HASHBYTES / 8
 #endif
 
 static const guint8 png_signature[] = {137, 80, 78, 71, 13, 10, 26, 10};
@@ -213,7 +214,9 @@ rspamd_image_normalize (struct rspamd_task *task, struct rspamd_image *img)
 {
 #ifdef WITH_GD
        gdImagePtr src = NULL, dst = NULL;
-       guint nw, nh, i, j;
+       guint nw, nh, i, j, b = 0;
+       gdouble avg, sum;
+       guchar sig[rspamd_cryptobox_HASHBYTES];
 
        if (img->data->len == 0 || img->data->len > G_MAXINT32) {
                return;
@@ -247,35 +250,91 @@ rspamd_image_normalize (struct rspamd_task *task, struct rspamd_image *img)
        }
        else {
                gdImageSetInterpolationMethod (src, GD_BILINEAR_FIXED);
-               nw = img->width;
-               nh = img->height;
 
-               if (nh > RSPAMD_NORMALIZED_DIM) {
-                       nw = nw * RSPAMD_NORMALIZED_DIM / nh;
-                       nw = nw ? nw : 1;
-                       nh = RSPAMD_NORMALIZED_DIM;
-               }
-
-               if (nw > RSPAMD_NORMALIZED_DIM) {
-                       nh = nh * RSPAMD_NORMALIZED_DIM / nw;
-                       nh = nh ? nh : 1;
-                       nw = RSPAMD_NORMALIZED_DIM;
-               }
+               nw = RSPAMD_NORMALIZED_DIM;
+               nh = RSPAMD_NORMALIZED_DIM;
 
                dst = gdImageScale (src, nw, nh);
+               gdImageGrayScale (dst);
                gdImageDestroy (src);
 
                img->normalized_data = g_array_sized_new (FALSE, FALSE, sizeof (gint),
                                nh * nw);
 
+               avg = 0;
+
+               /* Calculate moving average */
                for (i = 0; i < nh; i ++) {
                        for (j = 0; j < nw; j ++) {
                                gint px = gdImageGetPixel (dst, j, i);
+                               avg += (px - avg) / (gdouble)(i * nh + j + 1);
 
                                g_array_append_val (img->normalized_data, px);
                        }
                }
 
+               /*
+                * Split message into blocks:
+                *
+                * ****
+                * ****
+                *
+                * Get sum of saturation values, and set bit if sum is > avg * 4
+                * Then go further
+                *
+                * ****
+                * ****
+                *
+                * and repeat this algorithm.
+                *
+                * So on each iteration we move by 16 pixels and calculate 2 bits of signature
+                * hence, we produce ({64} / {4}) ^ 2 * 2 == 512 bits
+                */
+               for (i = 0; i < nh; i += 4) {
+                       for (j = 0; j < nw; j += 4) {
+                               gint p[8];
+
+                               p[0] = g_array_index (img->normalized_data, gint, i * nh + j);
+                               p[1] = g_array_index (img->normalized_data, gint, i * nh + j + 1);
+                               p[2] = g_array_index (img->normalized_data, gint, i * nh + j + 2);
+                               p[3] = g_array_index (img->normalized_data, gint, i * nh + j + 3);
+                               p[4] = g_array_index (img->normalized_data, gint, (i + 1) * nh + j);
+                               p[5] = g_array_index (img->normalized_data, gint, (i + 1) * nh + j + 1);
+                               p[6] = g_array_index (img->normalized_data, gint, (i + 1) * nh + j + 2);
+                               p[7] = g_array_index (img->normalized_data, gint, (i + 1) * nh + j + 3);
+                               sum = p[0] + p[1] + p[2] + p[3] + p[4] + p[5] + p[6] + p[7];
+
+                               if (fabs (sum) >= fabs (avg * 8)) {
+                                       setbit (sig, b);
+                               }
+                               else {
+                                       clrbit (sig, b);
+                               }
+                               b ++;
+
+                               p[0] = g_array_index (img->normalized_data, gint, (i + 2) * nh + j);
+                               p[1] = g_array_index (img->normalized_data, gint, (i + 2) * nh + j + 1);
+                               p[2] = g_array_index (img->normalized_data, gint, (i + 2) * nh + j + 2);
+                               p[3] = g_array_index (img->normalized_data, gint, (i + 2) * nh + j + 3);
+                               p[4] = g_array_index (img->normalized_data, gint, (i + 3) * nh + j);
+                               p[5] = g_array_index (img->normalized_data, gint, (i + 3) * nh + j + 1);
+                               p[6] = g_array_index (img->normalized_data, gint, (i + 3) * nh + j + 2);
+                               p[7] = g_array_index (img->normalized_data, gint, (i + 3) * nh + j + 3);
+
+                               sum = p[0] + p[1] + p[2] + p[3] + p[4] + p[5] + p[6] + p[7];
+
+                               if (fabs (sum) >= fabs (avg * 8)) {
+                                       setbit (sig, b);
+                               }
+                               else {
+                                       clrbit (sig, b);
+                               }
+                               b ++;
+                       }
+               }
+
+               msg_debug_task ("avg: %.0f, sig: %32xs, bits: %d", avg, sig, b);
+
                gdImageDestroy (dst);
                rspamd_mempool_add_destructor (task->task_pool, rspamd_array_free_hard,
                                img->normalized_data);
index 35833b6af186d7a9f15ea3aeceb3b2ef26d4ef37..1b46954e0c38219080ec510cfd36e5938525befb 100644 (file)
@@ -2,6 +2,7 @@
 #define IMAGES_H_
 
 #include "config.h"
+#include "cryptobox.h"
 
 struct html_image;
 struct rspamd_task;
@@ -19,6 +20,7 @@ struct rspamd_image {
        struct rspamd_mime_part *parent;
        GByteArray *data;
        GArray *normalized_data;
+       guchar fuzzy_sig[rspamd_cryptobox_HASHBYTES];
        const gchar *filename;
        struct html_image *html_image;
        enum rspamd_image_type type;