]> source.dussan.org Git - rspamd.git/commitdiff
* Add fuzzy hashes implementation (with adler-32 roll hash and FNV-32 hash)
authorVsevolod Stakhov <vsevolod@rambler-co.ru>
Fri, 20 Mar 2009 15:35:30 +0000 (18:35 +0300)
committerVsevolod Stakhov <vsevolod@rambler-co.ru>
Fri, 20 Mar 2009 15:35:30 +0000 (18:35 +0300)
* Add test case for fuzzy hashing check

CMakeLists.txt
src/fuzzy.c [new file with mode: 0644]
src/fuzzy.h [new file with mode: 0644]
test/rspamd_expression_test.c
test/rspamd_fuzzy_test.c [new file with mode: 0644]
test/rspamd_test_suite.c
test/tests.h

index 7ae4b7f78529450d074cd214181ecf22e3d4986d..d61844e709b182eea86dcbdca030589fcdaa9a15 100644 (file)
@@ -236,6 +236,7 @@ SET(RSPAMDSRC       src/modules.c
                                src/protocol.c
                                src/perl.c
                                src/message.c
+                               src/fuzzy.c
                                src/expressions.c
                                src/mem_pool.c
                                src/memcached.c
@@ -261,6 +262,7 @@ SET(TESTSRC         test/rspamd_expression_test.c
                                test/rspamd_memcached_test.c
                                test/rspamd_mem_pool_test.c
                                test/rspamd_statfile_test.c
+                               test/rspamd_fuzzy_test.c
                                test/rspamd_test_suite.c
                                test/rspamd_url_test.c)
 
@@ -268,6 +270,7 @@ SET(TESTDEPENDS     src/mem_pool.c
                                src/hash.c
                                src/url.c
                                src/util.c
+                               src/fuzzy.c
                                src/memcached.c
                                src/expressions.c
                                src/statfile.c)
diff --git a/src/fuzzy.c b/src/fuzzy.c
new file mode 100644 (file)
index 0000000..08814ea
--- /dev/null
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2009, Rambler media
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Rambler media ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include "config.h"
+#include "mem_pool.h"
+#include "fstring.h"
+#include "fuzzy.h"
+
+#define ROLL_WINDOW_SIZE 9
+#define MIN_FUZZY_BLOCK_SIZE 3
+#define HASH_INIT      0x28021967
+
+struct roll_state {
+       uint32_t h[3];
+       char window[ROLL_WINDOW_SIZE];
+       int n;
+};
+
+static struct roll_state rs;
+
+
+/* Rolling hash function based on Adler-32 checksum */
+static uint32_t 
+fuzzy_roll_hash (char c)
+{
+       /* Check window position */
+       if (rs.n == ROLL_WINDOW_SIZE) {
+               rs.n = 0;
+       }
+
+       rs.h[1] -= rs.h[0];
+       rs.h[1] += ROLL_WINDOW_SIZE * c;
+  
+       rs.h[0] += c;
+       rs.h[0] -= rs.window[rs.n];
+       
+       /* Save current symbol */
+       rs.window[rs.n] = c;
+       rs.n ++;
+  
+       rs.h[2] <<= 5;
+       rs.h[2] ^= c;
+  
+       return rs.h[1] + rs.h[2] + rs.h[3];
+}
+
+/* A simple non-rolling hash, based on the FNV hash */
+static uint32_t 
+fuzzy_fnv_hash (char c, uint32_t hval)
+{
+       hval ^= c;
+       hval = hval += (hval<<1) + (hval<<4) + (hval<<7) + (hval<<8) + (hval<<24);
+       return hval;
+}
+
+/* Calculate blocksize depending on length of input */
+static uint32_t
+fuzzy_blocksize (uint32_t len) 
+{
+       
+       if (len < MIN_FUZZY_BLOCK_SIZE) {
+               return MIN_FUZZY_BLOCK_SIZE;
+       }
+       return g_spaced_primes_closest (len / FUZZY_HASHLEN);
+}
+
+/* Update hash with new symbol */
+void
+fuzzy_update (fuzzy_hash_t *h, char c)
+{
+       h->rh = fuzzy_roll_hash (c);
+       h->h = fuzzy_fnv_hash(c, h->h);
+    
+       if (h->rh % h->block_size == (h->block_size - 1)) {
+               h->hash_pipe[h->hi] = h->h;
+               if (h->hi < FUZZY_HASHLEN - 1) {
+                       h->h = HASH_INIT;
+                       h->hi ++;
+               }
+       }
+}
+
+/*
+ * Levenshtein distance between string1 and string2.
+ *
+ * Replace cost is normally 1, and 2 with nonzero xcost.
+ */
+static uint32_t
+lev_distance (char *s1, int len1, char *s2, int len2)
+{      
+       int i;
+       int *row;  /* we only need to keep one row of costs */
+       int *end;
+       int half, nx;
+       char *sx, *char2p, char1;
+       int *p, D, x, offset, c3;
+
+       /* strip common prefix */
+       while (len1 > 0 && len2 > 0 && *s1 == *s2) {
+               len1 --;
+               len2 --;
+               s1 ++;
+               s2 ++;
+       }
+
+       /* strip common suffix */
+       while (len1 > 0 && len2 > 0 && s1[len1-1] == s2[len2-1]) {
+               len1--;
+               len2--;
+       }
+
+       /* catch trivial cases */
+       if (len1 == 0) {
+               return len2;
+       }
+
+       if (len2 == 0) {
+               return len1;
+       }
+
+       /* make the inner cycle (i.e. string2) the longer one */
+       if (len1 > len2) {
+               nx = len1;
+               sx = s1;
+               len1 = len2;
+               len2 = nx;
+               s1 = s2;
+               s2 = sx;
+       }
+       /* check len1 == 1 separately */
+       if (len1 == 1) {
+               return len2 - (memchr (s2, *s1, len2) != NULL);
+       }
+
+       len1 ++;
+       len2 ++;
+       half = len1 >> 1;
+
+       /* initalize first row */
+       row = g_malloc (len2 * sizeof(int));
+       end = row + len2 - 1;
+       for (i = 0; i < len2; i++) {
+               row[i] = i;
+       }
+
+       /* in this case we don't have to scan two corner triangles (of size len1/2)
+        * in the matrix because no best path can go throught them. note this
+        * breaks when len1 == len2 == 2 so the memchr() special case above is
+        * necessary */
+       row[0] = len1 - half - 1;
+       for (i = 1; i < len1; i++) {
+               char1 = s1[i - 1];
+               /* skip the upper triangle */
+               if (i >= len1 - half) {
+                       offset = i - (len1 - half);
+                       char2p = s2 + offset;
+                       p = row + offset;
+                       c3 = *(p++) + (char1 != *(char2p++));
+                       x = *p;
+                       x++;
+                       D = x;
+                       if (x > c3)
+                               x = c3;
+                       *(p++) = x;
+               }
+               else {
+                       p = row + 1;
+                       char2p = s2;
+                       D = x = i;
+               }
+               /* skip the lower triangle */
+               if (i <= half + 1)
+                       end = row + len2 + i - half - 2;
+               /* main */
+               while (p <= end) {
+                       c3 = --D + (char1 != *(char2p++));
+                       x++;
+                       if (x > c3)
+                               x = c3;
+                       D = *p;
+                       D++;
+                       if (x > D)
+                               x = D;
+                       *(p++) = x;
+               }
+               /* lower triangle sentinel */
+               if (i <= half) {
+                       c3 = --D + (char1 != *char2p);
+                       x++;
+                       if (x > c3)
+                               x = c3;
+                       *p = x;
+               }
+       }
+
+       i = *end;
+       g_free (row);
+       return i;
+}
+
+/* Calculate fuzzy hash for specified string */
+fuzzy_hash_t *
+fuzzy_init (f_str_t *in, memory_pool_t *pool)
+{
+       fuzzy_hash_t *new;
+       int i, repeats = 0;
+       char *c = in->begin, last = '\0';
+
+       new = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t));
+       bzero (&rs, sizeof (rs));
+       new->block_size = fuzzy_blocksize (in->len);
+       
+       for (i = 0; i < in->len; i ++) {
+               if (*c == last) {
+                       repeats ++;
+               }
+               else {
+                       repeats = 0;
+               }
+               if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c) && repeats < 3) {
+                       fuzzy_update (new, *c);
+               }
+               last = *c;
+               c ++;
+       }
+
+       return new;
+}
+
+/* Compare score of difference between two hashes 0 - different hashes, 100 - identical hashes */
+int
+fuzzy_compare_hashes (fuzzy_hash_t *h1, fuzzy_hash_t *h2) 
+{
+       int res, l1, l2;
+
+       /* If we have hashes of different size, input strings are too different */
+       if (h1->block_size != h2->block_size) {
+               return 100;
+       }
+       
+       l1 = strlen (h1->hash_pipe);
+       l2 = strlen (h2->hash_pipe);
+       res = lev_distance (h1->hash_pipe, l1, h2->hash_pipe, l2);
+       res = (res * 100) / (l1 + l2);
+
+       return res;
+}
+
+/* 
+ * vi:ts=4 
+ */
diff --git a/src/fuzzy.h b/src/fuzzy.h
new file mode 100644 (file)
index 0000000..91e6512
--- /dev/null
@@ -0,0 +1,40 @@
+/**
+ * @file fuzzy.h
+ * Fuzzy hashes API
+ */
+
+#ifndef RSPAMD_FUZZY_H
+#define RSPAMD_FUZZY_H
+
+#include "config.h"
+#include "mem_pool.h"
+#include "fstring.h"
+
+#define FUZZY_HASHLEN 64
+
+typedef struct fuzzy_hash_s {
+       char hash_pipe[FUZZY_HASHLEN];                  /**< result hash                                        */
+       uint32_t block_size;                                    /**< current blocksize                          */
+       uint32_t rh;                                                    /**< roll hash value                            */
+       uint32_t h;                                                             /**< hash of block                                      */
+       uint32_t hi;                                                    /**< current index in hash pipe         */
+} fuzzy_hash_t;
+
+/**
+ * Calculate fuzzy hash for specified string
+ * @param in input string
+ * @param pool pool object
+ * @return fuzzy_hash object allocated in pool
+ */
+fuzzy_hash_t * fuzzy_init (f_str_t *in, memory_pool_t *pool);
+
+/**
+ * Compare score of difference between two hashes 
+ * @param h1 first hash
+ * @param h2 second hash
+ * @return result in percents 0 - different hashes, 100 - identical hashes 
+ */
+int fuzzy_compare_hashes (fuzzy_hash_t *h1, fuzzy_hash_t *h2);
+
+
+#endif
index c81d3d3814b0f8ddcd1a832f2d45a26f255563ee..7b723e9e6f7914d598d636d44021ad9a528ed911 100644 (file)
@@ -1,17 +1,3 @@
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/wait.h>
-#include <sys/param.h>
-
-#include <netinet/in.h>
-#include <arpa/inet.h>
-#include <netdb.h>
-#include <syslog.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-
 #include "../src/config.h"
 #include "../src/main.h"
 #include "../src/cfg_file.h"
diff --git a/test/rspamd_fuzzy_test.c b/test/rspamd_fuzzy_test.c
new file mode 100644 (file)
index 0000000..d737a91
--- /dev/null
@@ -0,0 +1,44 @@
+#include "../src/config.h"
+#include "../src/main.h"
+#include "../src/fuzzy.h"
+#include "tests.h"
+
+static char *s1 = "This is sample test text.\r\n"
+                                 "abcdefghijklmnopqrstuvwx.\r\n"
+                                 "abcdefghijklmnopqrstuvwx.\r\n"
+                                 "abcdefghijklmnopqrstuvwx.\r\n"
+                                 "abcdefghijklmnopqrstuvwx.\r\n"
+                                 "abcdefghijklmnopqrstuvwx.\r\n"
+                                 "abcdefghijklmnopqrstuvwx.\r\n"
+                                 "abcdefghijklmnopqrstuvwx.\r\n"
+                                 "abcdefghijklmnopqrstuvwx.\r\n";
+static char *s2 = "This is sample test text.\r\n"
+                                 "abcdefghijklmnopqrstuvwx.\r\n"
+                                 "abcdefghijklmnopzrstuvwx.\r\n"
+                                 "abcdefghijklmnopqrstuvwx.\r\n"
+                                 "abcdefghijklmnopqrstuvwx.\r\n"
+                                 "abcdefghijklmnopqrstuvwx.\r\n"
+                                 "abcdefghijklmnopqrstuvwx.\r\n"
+                                 "abcdefghijklmnopqrstuvwx.\r\n"
+                                 "abcdefghijklmnopqrstuvwx.\r\n";
+
+void 
+rspamd_fuzzy_test_func ()
+{
+       memory_pool_t *pool;
+       fuzzy_hash_t *h1, *h2;
+       f_str_t f1, f2;
+
+       pool = memory_pool_new (1024);
+       f1.begin = s1;
+       f1.len = strlen (s1);
+       f2.begin = s2;
+       f2.len = strlen (s2);
+
+       h1 = fuzzy_init (&f1, pool);
+       h2 = fuzzy_init (&f2, pool);
+
+       msg_info ("rspamd_fuzzy_test_func: difference between strings is %d", fuzzy_compare_hashes (h1, h2));
+
+       memory_pool_delete (pool);
+}
index 81291aa060472fb6c846bf14c5d85a6ac80584b5..b3bc6c89392fba158ba5d0e79a589fdb85186673 100644 (file)
@@ -22,6 +22,7 @@ main (int argc, char **argv)
 
        g_test_add_func ("/rspamd/memcached", rspamd_memcached_test_func);
        g_test_add_func ("/rspamd/mem_pool", rspamd_mem_pool_test_func);
+       g_test_add_func ("/rspamd/fuzzy", rspamd_fuzzy_test_func);
        g_test_add_func ("/rspamd/url", rspamd_url_test_func);
        g_test_add_func ("/rspamd/expression", rspamd_expression_test_func);
        g_test_add_func ("/rspamd/statfile", rspamd_statfile_test_func);
index ed852f2d378fb6e752822ebaa3c7557d81ca15f4..c3692d460685dcc14740cd867f3814bc9c07a78b 100644 (file)
@@ -17,6 +17,9 @@ void rspamd_mem_pool_test_func ();
 /* Expressions */
 void rspamd_expression_test_func ();
 
+/* Fuzzy hashes */
+void rspamd_fuzzy_test_func ();
+
 /* Stat file */
 void rspamd_statfile_test_func ();