diff options
-rw-r--r-- | CMakeLists.txt | 3 | ||||
-rw-r--r-- | src/fuzzy.c | 273 | ||||
-rw-r--r-- | src/fuzzy.h | 40 | ||||
-rw-r--r-- | test/rspamd_expression_test.c | 14 | ||||
-rw-r--r-- | test/rspamd_fuzzy_test.c | 44 | ||||
-rw-r--r-- | test/rspamd_test_suite.c | 1 | ||||
-rw-r--r-- | test/tests.h | 3 |
7 files changed, 364 insertions, 14 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ae4b7f78..d61844e70 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -236,6 +236,7 @@ SET(RSPAMDSRC src/modules.c src/protocol.c src/perl.c src/message.c + src/fuzzy.c src/expressions.c src/mem_pool.c src/memcached.c @@ -261,6 +262,7 @@ SET(TESTSRC test/rspamd_expression_test.c test/rspamd_memcached_test.c test/rspamd_mem_pool_test.c test/rspamd_statfile_test.c + test/rspamd_fuzzy_test.c test/rspamd_test_suite.c test/rspamd_url_test.c) @@ -268,6 +270,7 @@ SET(TESTDEPENDS src/mem_pool.c src/hash.c src/url.c src/util.c + src/fuzzy.c src/memcached.c src/expressions.c src/statfile.c) diff --git a/src/fuzzy.c b/src/fuzzy.c new file mode 100644 index 000000000..08814eaa1 --- /dev/null +++ b/src/fuzzy.c @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2009, Rambler media + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY Rambler media ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#include "config.h" +#include "mem_pool.h" +#include "fstring.h" +#include "fuzzy.h" + +#define ROLL_WINDOW_SIZE 9 +#define MIN_FUZZY_BLOCK_SIZE 3 +#define HASH_INIT 0x28021967 + +struct roll_state { + uint32_t h[3]; + char window[ROLL_WINDOW_SIZE]; + int n; +}; + +static struct roll_state rs; + + +/* Rolling hash function based on Adler-32 checksum */ +static uint32_t +fuzzy_roll_hash (char c) +{ + /* Check window position */ + if (rs.n == ROLL_WINDOW_SIZE) { + rs.n = 0; + } + + rs.h[1] -= rs.h[0]; + rs.h[1] += ROLL_WINDOW_SIZE * c; + + rs.h[0] += c; + rs.h[0] -= rs.window[rs.n]; + + /* Save current symbol */ + rs.window[rs.n] = c; + rs.n ++; + + rs.h[2] <<= 5; + rs.h[2] ^= c; + + return rs.h[1] + rs.h[2] + rs.h[3]; +} + +/* A simple non-rolling hash, based on the FNV hash */ +static uint32_t +fuzzy_fnv_hash (char c, uint32_t hval) +{ + hval ^= c; + hval = hval += (hval<<1) + (hval<<4) + (hval<<7) + (hval<<8) + (hval<<24); + return hval; +} + +/* Calculate blocksize depending on length of input */ +static uint32_t +fuzzy_blocksize (uint32_t len) +{ + + if (len < MIN_FUZZY_BLOCK_SIZE) { + return MIN_FUZZY_BLOCK_SIZE; + } + return g_spaced_primes_closest (len / FUZZY_HASHLEN); +} + +/* Update hash with new symbol */ +void +fuzzy_update (fuzzy_hash_t *h, char c) +{ + h->rh = fuzzy_roll_hash (c); + h->h = fuzzy_fnv_hash(c, h->h); + + if (h->rh % h->block_size == (h->block_size - 1)) { + h->hash_pipe[h->hi] = h->h; + if (h->hi < FUZZY_HASHLEN - 1) { + h->h = HASH_INIT; + h->hi ++; + } + } +} + +/* + * Levenshtein distance between string1 and string2. + * + * Replace cost is normally 1, and 2 with nonzero xcost. + */ +static uint32_t +lev_distance (char *s1, int len1, char *s2, int len2) +{ + int i; + int *row; /* we only need to keep one row of costs */ + int *end; + int half, nx; + char *sx, *char2p, char1; + int *p, D, x, offset, c3; + + /* strip common prefix */ + while (len1 > 0 && len2 > 0 && *s1 == *s2) { + len1 --; + len2 --; + s1 ++; + s2 ++; + } + + /* strip common suffix */ + while (len1 > 0 && len2 > 0 && s1[len1-1] == s2[len2-1]) { + len1--; + len2--; + } + + /* catch trivial cases */ + if (len1 == 0) { + return len2; + } + + if (len2 == 0) { + return len1; + } + + /* make the inner cycle (i.e. string2) the longer one */ + if (len1 > len2) { + nx = len1; + sx = s1; + len1 = len2; + len2 = nx; + s1 = s2; + s2 = sx; + } + /* check len1 == 1 separately */ + if (len1 == 1) { + return len2 - (memchr (s2, *s1, len2) != NULL); + } + + len1 ++; + len2 ++; + half = len1 >> 1; + + /* initalize first row */ + row = g_malloc (len2 * sizeof(int)); + end = row + len2 - 1; + for (i = 0; i < len2; i++) { + row[i] = i; + } + + /* in this case we don't have to scan two corner triangles (of size len1/2) + * in the matrix because no best path can go throught them. note this + * breaks when len1 == len2 == 2 so the memchr() special case above is + * necessary */ + row[0] = len1 - half - 1; + for (i = 1; i < len1; i++) { + char1 = s1[i - 1]; + /* skip the upper triangle */ + if (i >= len1 - half) { + offset = i - (len1 - half); + char2p = s2 + offset; + p = row + offset; + c3 = *(p++) + (char1 != *(char2p++)); + x = *p; + x++; + D = x; + if (x > c3) + x = c3; + *(p++) = x; + } + else { + p = row + 1; + char2p = s2; + D = x = i; + } + /* skip the lower triangle */ + if (i <= half + 1) + end = row + len2 + i - half - 2; + /* main */ + while (p <= end) { + c3 = --D + (char1 != *(char2p++)); + x++; + if (x > c3) + x = c3; + D = *p; + D++; + if (x > D) + x = D; + *(p++) = x; + } + /* lower triangle sentinel */ + if (i <= half) { + c3 = --D + (char1 != *char2p); + x++; + if (x > c3) + x = c3; + *p = x; + } + } + + i = *end; + g_free (row); + return i; +} + +/* Calculate fuzzy hash for specified string */ +fuzzy_hash_t * +fuzzy_init (f_str_t *in, memory_pool_t *pool) +{ + fuzzy_hash_t *new; + int i, repeats = 0; + char *c = in->begin, last = '\0'; + + new = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t)); + bzero (&rs, sizeof (rs)); + new->block_size = fuzzy_blocksize (in->len); + + for (i = 0; i < in->len; i ++) { + if (*c == last) { + repeats ++; + } + else { + repeats = 0; + } + if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c) && repeats < 3) { + fuzzy_update (new, *c); + } + last = *c; + c ++; + } + + return new; +} + +/* Compare score of difference between two hashes 0 - different hashes, 100 - identical hashes */ +int +fuzzy_compare_hashes (fuzzy_hash_t *h1, fuzzy_hash_t *h2) +{ + int res, l1, l2; + + /* If we have hashes of different size, input strings are too different */ + if (h1->block_size != h2->block_size) { + return 100; + } + + l1 = strlen (h1->hash_pipe); + l2 = strlen (h2->hash_pipe); + res = lev_distance (h1->hash_pipe, l1, h2->hash_pipe, l2); + res = (res * 100) / (l1 + l2); + + return res; +} + +/* + * vi:ts=4 + */ diff --git a/src/fuzzy.h b/src/fuzzy.h new file mode 100644 index 000000000..91e6512c6 --- /dev/null +++ b/src/fuzzy.h @@ -0,0 +1,40 @@ +/** + * @file fuzzy.h + * Fuzzy hashes API + */ + +#ifndef RSPAMD_FUZZY_H +#define RSPAMD_FUZZY_H + +#include "config.h" +#include "mem_pool.h" +#include "fstring.h" + +#define FUZZY_HASHLEN 64 + +typedef struct fuzzy_hash_s { + char hash_pipe[FUZZY_HASHLEN]; /**< result hash */ + uint32_t block_size; /**< current blocksize */ + uint32_t rh; /**< roll hash value */ + uint32_t h; /**< hash of block */ + uint32_t hi; /**< current index in hash pipe */ +} fuzzy_hash_t; + +/** + * Calculate fuzzy hash for specified string + * @param in input string + * @param pool pool object + * @return fuzzy_hash object allocated in pool + */ +fuzzy_hash_t * fuzzy_init (f_str_t *in, memory_pool_t *pool); + +/** + * Compare score of difference between two hashes + * @param h1 first hash + * @param h2 second hash + * @return result in percents 0 - different hashes, 100 - identical hashes + */ +int fuzzy_compare_hashes (fuzzy_hash_t *h1, fuzzy_hash_t *h2); + + +#endif diff --git a/test/rspamd_expression_test.c b/test/rspamd_expression_test.c index c81d3d381..7b723e9e6 100644 --- a/test/rspamd_expression_test.c +++ b/test/rspamd_expression_test.c @@ -1,17 +1,3 @@ -#include <sys/types.h> -#include <sys/time.h> -#include <sys/wait.h> -#include <sys/param.h> - -#include <netinet/in.h> -#include <arpa/inet.h> -#include <netdb.h> -#include <syslog.h> -#include <fcntl.h> -#include <stdlib.h> -#include <string.h> -#include <stdio.h> - #include "../src/config.h" #include "../src/main.h" #include "../src/cfg_file.h" diff --git a/test/rspamd_fuzzy_test.c b/test/rspamd_fuzzy_test.c new file mode 100644 index 000000000..d737a9171 --- /dev/null +++ b/test/rspamd_fuzzy_test.c @@ -0,0 +1,44 @@ +#include "../src/config.h" +#include "../src/main.h" +#include "../src/fuzzy.h" +#include "tests.h" + +static char *s1 = "This is sample test text.\r\n" + "abcdefghijklmnopqrstuvwx.\r\n" + "abcdefghijklmnopqrstuvwx.\r\n" + "abcdefghijklmnopqrstuvwx.\r\n" + "abcdefghijklmnopqrstuvwx.\r\n" + "abcdefghijklmnopqrstuvwx.\r\n" + "abcdefghijklmnopqrstuvwx.\r\n" + "abcdefghijklmnopqrstuvwx.\r\n" + "abcdefghijklmnopqrstuvwx.\r\n"; +static char *s2 = "This is sample test text.\r\n" + "abcdefghijklmnopqrstuvwx.\r\n" + "abcdefghijklmnopzrstuvwx.\r\n" + "abcdefghijklmnopqrstuvwx.\r\n" + "abcdefghijklmnopqrstuvwx.\r\n" + "abcdefghijklmnopqrstuvwx.\r\n" + "abcdefghijklmnopqrstuvwx.\r\n" + "abcdefghijklmnopqrstuvwx.\r\n" + "abcdefghijklmnopqrstuvwx.\r\n"; + +void +rspamd_fuzzy_test_func () +{ + memory_pool_t *pool; + fuzzy_hash_t *h1, *h2; + f_str_t f1, f2; + + pool = memory_pool_new (1024); + f1.begin = s1; + f1.len = strlen (s1); + f2.begin = s2; + f2.len = strlen (s2); + + h1 = fuzzy_init (&f1, pool); + h2 = fuzzy_init (&f2, pool); + + msg_info ("rspamd_fuzzy_test_func: difference between strings is %d", fuzzy_compare_hashes (h1, h2)); + + memory_pool_delete (pool); +} diff --git a/test/rspamd_test_suite.c b/test/rspamd_test_suite.c index 81291aa06..b3bc6c893 100644 --- a/test/rspamd_test_suite.c +++ b/test/rspamd_test_suite.c @@ -22,6 +22,7 @@ main (int argc, char **argv) g_test_add_func ("/rspamd/memcached", rspamd_memcached_test_func); g_test_add_func ("/rspamd/mem_pool", rspamd_mem_pool_test_func); + g_test_add_func ("/rspamd/fuzzy", rspamd_fuzzy_test_func); g_test_add_func ("/rspamd/url", rspamd_url_test_func); g_test_add_func ("/rspamd/expression", rspamd_expression_test_func); g_test_add_func ("/rspamd/statfile", rspamd_statfile_test_func); diff --git a/test/tests.h b/test/tests.h index ed852f2d3..c3692d460 100644 --- a/test/tests.h +++ b/test/tests.h @@ -17,6 +17,9 @@ void rspamd_mem_pool_test_func (); /* Expressions */ void rspamd_expression_test_func (); +/* Fuzzy hashes */ +void rspamd_fuzzy_test_func (); + /* Stat file */ void rspamd_statfile_test_func (); |