aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--CMakeLists.txt3
-rw-r--r--src/fuzzy.c273
-rw-r--r--src/fuzzy.h40
-rw-r--r--test/rspamd_expression_test.c14
-rw-r--r--test/rspamd_fuzzy_test.c44
-rw-r--r--test/rspamd_test_suite.c1
-rw-r--r--test/tests.h3
7 files changed, 364 insertions, 14 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7ae4b7f78..d61844e70 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -236,6 +236,7 @@ SET(RSPAMDSRC src/modules.c
src/protocol.c
src/perl.c
src/message.c
+ src/fuzzy.c
src/expressions.c
src/mem_pool.c
src/memcached.c
@@ -261,6 +262,7 @@ SET(TESTSRC test/rspamd_expression_test.c
test/rspamd_memcached_test.c
test/rspamd_mem_pool_test.c
test/rspamd_statfile_test.c
+ test/rspamd_fuzzy_test.c
test/rspamd_test_suite.c
test/rspamd_url_test.c)
@@ -268,6 +270,7 @@ SET(TESTDEPENDS src/mem_pool.c
src/hash.c
src/url.c
src/util.c
+ src/fuzzy.c
src/memcached.c
src/expressions.c
src/statfile.c)
diff --git a/src/fuzzy.c b/src/fuzzy.c
new file mode 100644
index 000000000..08814eaa1
--- /dev/null
+++ b/src/fuzzy.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2009, Rambler media
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Rambler media ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include "config.h"
+#include "mem_pool.h"
+#include "fstring.h"
+#include "fuzzy.h"
+
+#define ROLL_WINDOW_SIZE 9
+#define MIN_FUZZY_BLOCK_SIZE 3
+#define HASH_INIT 0x28021967
+
+struct roll_state {
+ uint32_t h[3];
+ char window[ROLL_WINDOW_SIZE];
+ int n;
+};
+
+static struct roll_state rs;
+
+
+/* Rolling hash function based on Adler-32 checksum */
+static uint32_t
+fuzzy_roll_hash (char c)
+{
+ /* Check window position */
+ if (rs.n == ROLL_WINDOW_SIZE) {
+ rs.n = 0;
+ }
+
+ rs.h[1] -= rs.h[0];
+ rs.h[1] += ROLL_WINDOW_SIZE * c;
+
+ rs.h[0] += c;
+ rs.h[0] -= rs.window[rs.n];
+
+ /* Save current symbol */
+ rs.window[rs.n] = c;
+ rs.n ++;
+
+ rs.h[2] <<= 5;
+ rs.h[2] ^= c;
+
+ return rs.h[1] + rs.h[2] + rs.h[3];
+}
+
+/* A simple non-rolling hash, based on the FNV hash */
+static uint32_t
+fuzzy_fnv_hash (char c, uint32_t hval)
+{
+ hval ^= c;
+ hval = hval += (hval<<1) + (hval<<4) + (hval<<7) + (hval<<8) + (hval<<24);
+ return hval;
+}
+
+/* Calculate blocksize depending on length of input */
+static uint32_t
+fuzzy_blocksize (uint32_t len)
+{
+
+ if (len < MIN_FUZZY_BLOCK_SIZE) {
+ return MIN_FUZZY_BLOCK_SIZE;
+ }
+ return g_spaced_primes_closest (len / FUZZY_HASHLEN);
+}
+
+/* Update hash with new symbol */
+void
+fuzzy_update (fuzzy_hash_t *h, char c)
+{
+ h->rh = fuzzy_roll_hash (c);
+ h->h = fuzzy_fnv_hash(c, h->h);
+
+ if (h->rh % h->block_size == (h->block_size - 1)) {
+ h->hash_pipe[h->hi] = h->h;
+ if (h->hi < FUZZY_HASHLEN - 1) {
+ h->h = HASH_INIT;
+ h->hi ++;
+ }
+ }
+}
+
+/*
+ * Levenshtein distance between string1 and string2.
+ *
+ * Replace cost is normally 1, and 2 with nonzero xcost.
+ */
+static uint32_t
+lev_distance (char *s1, int len1, char *s2, int len2)
+{
+ int i;
+ int *row; /* we only need to keep one row of costs */
+ int *end;
+ int half, nx;
+ char *sx, *char2p, char1;
+ int *p, D, x, offset, c3;
+
+ /* strip common prefix */
+ while (len1 > 0 && len2 > 0 && *s1 == *s2) {
+ len1 --;
+ len2 --;
+ s1 ++;
+ s2 ++;
+ }
+
+ /* strip common suffix */
+ while (len1 > 0 && len2 > 0 && s1[len1-1] == s2[len2-1]) {
+ len1--;
+ len2--;
+ }
+
+ /* catch trivial cases */
+ if (len1 == 0) {
+ return len2;
+ }
+
+ if (len2 == 0) {
+ return len1;
+ }
+
+ /* make the inner cycle (i.e. string2) the longer one */
+ if (len1 > len2) {
+ nx = len1;
+ sx = s1;
+ len1 = len2;
+ len2 = nx;
+ s1 = s2;
+ s2 = sx;
+ }
+ /* check len1 == 1 separately */
+ if (len1 == 1) {
+ return len2 - (memchr (s2, *s1, len2) != NULL);
+ }
+
+ len1 ++;
+ len2 ++;
+ half = len1 >> 1;
+
+ /* initalize first row */
+ row = g_malloc (len2 * sizeof(int));
+ end = row + len2 - 1;
+ for (i = 0; i < len2; i++) {
+ row[i] = i;
+ }
+
+ /* in this case we don't have to scan two corner triangles (of size len1/2)
+ * in the matrix because no best path can go throught them. note this
+ * breaks when len1 == len2 == 2 so the memchr() special case above is
+ * necessary */
+ row[0] = len1 - half - 1;
+ for (i = 1; i < len1; i++) {
+ char1 = s1[i - 1];
+ /* skip the upper triangle */
+ if (i >= len1 - half) {
+ offset = i - (len1 - half);
+ char2p = s2 + offset;
+ p = row + offset;
+ c3 = *(p++) + (char1 != *(char2p++));
+ x = *p;
+ x++;
+ D = x;
+ if (x > c3)
+ x = c3;
+ *(p++) = x;
+ }
+ else {
+ p = row + 1;
+ char2p = s2;
+ D = x = i;
+ }
+ /* skip the lower triangle */
+ if (i <= half + 1)
+ end = row + len2 + i - half - 2;
+ /* main */
+ while (p <= end) {
+ c3 = --D + (char1 != *(char2p++));
+ x++;
+ if (x > c3)
+ x = c3;
+ D = *p;
+ D++;
+ if (x > D)
+ x = D;
+ *(p++) = x;
+ }
+ /* lower triangle sentinel */
+ if (i <= half) {
+ c3 = --D + (char1 != *char2p);
+ x++;
+ if (x > c3)
+ x = c3;
+ *p = x;
+ }
+ }
+
+ i = *end;
+ g_free (row);
+ return i;
+}
+
+/* Calculate fuzzy hash for specified string */
+fuzzy_hash_t *
+fuzzy_init (f_str_t *in, memory_pool_t *pool)
+{
+ fuzzy_hash_t *new;
+ int i, repeats = 0;
+ char *c = in->begin, last = '\0';
+
+ new = memory_pool_alloc0 (pool, sizeof (fuzzy_hash_t));
+ bzero (&rs, sizeof (rs));
+ new->block_size = fuzzy_blocksize (in->len);
+
+ for (i = 0; i < in->len; i ++) {
+ if (*c == last) {
+ repeats ++;
+ }
+ else {
+ repeats = 0;
+ }
+ if (!g_ascii_isspace (*c) && !g_ascii_ispunct (*c) && repeats < 3) {
+ fuzzy_update (new, *c);
+ }
+ last = *c;
+ c ++;
+ }
+
+ return new;
+}
+
+/* Compare score of difference between two hashes 0 - different hashes, 100 - identical hashes */
+int
+fuzzy_compare_hashes (fuzzy_hash_t *h1, fuzzy_hash_t *h2)
+{
+ int res, l1, l2;
+
+ /* If we have hashes of different size, input strings are too different */
+ if (h1->block_size != h2->block_size) {
+ return 100;
+ }
+
+ l1 = strlen (h1->hash_pipe);
+ l2 = strlen (h2->hash_pipe);
+ res = lev_distance (h1->hash_pipe, l1, h2->hash_pipe, l2);
+ res = (res * 100) / (l1 + l2);
+
+ return res;
+}
+
+/*
+ * vi:ts=4
+ */
diff --git a/src/fuzzy.h b/src/fuzzy.h
new file mode 100644
index 000000000..91e6512c6
--- /dev/null
+++ b/src/fuzzy.h
@@ -0,0 +1,40 @@
+/**
+ * @file fuzzy.h
+ * Fuzzy hashes API
+ */
+
+#ifndef RSPAMD_FUZZY_H
+#define RSPAMD_FUZZY_H
+
+#include "config.h"
+#include "mem_pool.h"
+#include "fstring.h"
+
+#define FUZZY_HASHLEN 64
+
+typedef struct fuzzy_hash_s {
+ char hash_pipe[FUZZY_HASHLEN]; /**< result hash */
+ uint32_t block_size; /**< current blocksize */
+ uint32_t rh; /**< roll hash value */
+ uint32_t h; /**< hash of block */
+ uint32_t hi; /**< current index in hash pipe */
+} fuzzy_hash_t;
+
+/**
+ * Calculate fuzzy hash for specified string
+ * @param in input string
+ * @param pool pool object
+ * @return fuzzy_hash object allocated in pool
+ */
+fuzzy_hash_t * fuzzy_init (f_str_t *in, memory_pool_t *pool);
+
+/**
+ * Compare score of difference between two hashes
+ * @param h1 first hash
+ * @param h2 second hash
+ * @return result in percents 0 - different hashes, 100 - identical hashes
+ */
+int fuzzy_compare_hashes (fuzzy_hash_t *h1, fuzzy_hash_t *h2);
+
+
+#endif
diff --git a/test/rspamd_expression_test.c b/test/rspamd_expression_test.c
index c81d3d381..7b723e9e6 100644
--- a/test/rspamd_expression_test.c
+++ b/test/rspamd_expression_test.c
@@ -1,17 +1,3 @@
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/wait.h>
-#include <sys/param.h>
-
-#include <netinet/in.h>
-#include <arpa/inet.h>
-#include <netdb.h>
-#include <syslog.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-
#include "../src/config.h"
#include "../src/main.h"
#include "../src/cfg_file.h"
diff --git a/test/rspamd_fuzzy_test.c b/test/rspamd_fuzzy_test.c
new file mode 100644
index 000000000..d737a9171
--- /dev/null
+++ b/test/rspamd_fuzzy_test.c
@@ -0,0 +1,44 @@
+#include "../src/config.h"
+#include "../src/main.h"
+#include "../src/fuzzy.h"
+#include "tests.h"
+
+static char *s1 = "This is sample test text.\r\n"
+ "abcdefghijklmnopqrstuvwx.\r\n"
+ "abcdefghijklmnopqrstuvwx.\r\n"
+ "abcdefghijklmnopqrstuvwx.\r\n"
+ "abcdefghijklmnopqrstuvwx.\r\n"
+ "abcdefghijklmnopqrstuvwx.\r\n"
+ "abcdefghijklmnopqrstuvwx.\r\n"
+ "abcdefghijklmnopqrstuvwx.\r\n"
+ "abcdefghijklmnopqrstuvwx.\r\n";
+static char *s2 = "This is sample test text.\r\n"
+ "abcdefghijklmnopqrstuvwx.\r\n"
+ "abcdefghijklmnopzrstuvwx.\r\n"
+ "abcdefghijklmnopqrstuvwx.\r\n"
+ "abcdefghijklmnopqrstuvwx.\r\n"
+ "abcdefghijklmnopqrstuvwx.\r\n"
+ "abcdefghijklmnopqrstuvwx.\r\n"
+ "abcdefghijklmnopqrstuvwx.\r\n"
+ "abcdefghijklmnopqrstuvwx.\r\n";
+
+void
+rspamd_fuzzy_test_func ()
+{
+ memory_pool_t *pool;
+ fuzzy_hash_t *h1, *h2;
+ f_str_t f1, f2;
+
+ pool = memory_pool_new (1024);
+ f1.begin = s1;
+ f1.len = strlen (s1);
+ f2.begin = s2;
+ f2.len = strlen (s2);
+
+ h1 = fuzzy_init (&f1, pool);
+ h2 = fuzzy_init (&f2, pool);
+
+ msg_info ("rspamd_fuzzy_test_func: difference between strings is %d", fuzzy_compare_hashes (h1, h2));
+
+ memory_pool_delete (pool);
+}
diff --git a/test/rspamd_test_suite.c b/test/rspamd_test_suite.c
index 81291aa06..b3bc6c893 100644
--- a/test/rspamd_test_suite.c
+++ b/test/rspamd_test_suite.c
@@ -22,6 +22,7 @@ main (int argc, char **argv)
g_test_add_func ("/rspamd/memcached", rspamd_memcached_test_func);
g_test_add_func ("/rspamd/mem_pool", rspamd_mem_pool_test_func);
+ g_test_add_func ("/rspamd/fuzzy", rspamd_fuzzy_test_func);
g_test_add_func ("/rspamd/url", rspamd_url_test_func);
g_test_add_func ("/rspamd/expression", rspamd_expression_test_func);
g_test_add_func ("/rspamd/statfile", rspamd_statfile_test_func);
diff --git a/test/tests.h b/test/tests.h
index ed852f2d3..c3692d460 100644
--- a/test/tests.h
+++ b/test/tests.h
@@ -17,6 +17,9 @@ void rspamd_mem_pool_test_func ();
/* Expressions */
void rspamd_expression_test_func ();
+/* Fuzzy hashes */
+void rspamd_fuzzy_test_func ();
+
/* Stat file */
void rspamd_statfile_test_func ();