From: Vsevolod Stakhov Date: Wed, 19 Aug 2009 16:08:16 +0000 (+0400) Subject: * Add implementation of counting bloom filter to rspamd X-Git-Tag: 0.2.7~47 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=8b8b79dd4f5bb0e8fd7b5d6f5fcd0a8050502862;p=rspamd.git * Add implementation of counting bloom filter to rspamd --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 7689d61cb..3024a07d2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -337,6 +337,7 @@ SET(RSPAMDSRC src/modules.c src/radix.c src/view.c src/map.c + src/bloom.c src/symbols_cache.c src/fuzzy_storage.c) diff --git a/conf/rspamd.conf.sample b/conf/rspamd.conf.sample index 69fc6a525..926f9901e 100644 --- a/conf/rspamd.conf.sample +++ b/conf/rspamd.conf.sample @@ -123,6 +123,7 @@ factors { "R_SPAM_FROM_LIBERO" = 10; "R_FAKE_OUTLOOK"= 8; "R_FAKE_THEBAT"= 8; + "R_LOTTO" = 5; "KAM_LOTTO1" = 7; "FORGED_OUTLOOK_HTML" = 5; "SUSPICIOUS_RECIPS" = 3.5; @@ -153,11 +154,6 @@ factors { "DRUGS_MANYKINDS" = 2; "FAKE_REPLY_C" = 6; "MIME_HTML_ONLY" = 1; - # Voweling - "FROM_DOMAIN_NOVOWEL" = 7; - "FROM_LOCAL_NOVOWEL" = 8; - "FROM_LOCAL_HEX" = 8; - "FROM_LOCAL_DIGITS" = 8; # Modules factors "R_MIXED_CHARSET" = 5; @@ -244,6 +240,7 @@ factors { R_FAKE_OUTLOOK="${R_FAKE_OUTLOOK}"; R_FAKE_THEBAT="${R_FAKE_THEBAT}"; R_MISSING_CHARSET="${R_MISSING_CHARSET}"; + R_LOTTO="${R_LOTTO}"; KAM_LOTTO1="${KAM_LOTTO1}"; FORGED_OUTLOOK_HTML="${FORGED_OUTLOOK_HTML}"; SUSPICIOUS_RECIPS="${SUSPICIOUS_RECIPS}"; @@ -274,11 +271,6 @@ factors { FAKE_REPLY_C="${FAKE_REPLY_C}"; MIME_HTML_ONLY="${MIME_HTML_ONLY}"; - - FROM_DOMAIN_NOVOWEL="${FROM_DOMAIN_NOVOWEL}"; - FROM_LOCAL_NOVOWEL="${FROM_LOCAL_NOVOWEL}"; - FROM_LOCAL_HEX="${FROM_LOCAL_HEX}"; - FROM_LOCAL_DIGITS="${FROM_LOCAL_DIGITS}"; }; .module 'chartable' { @@ -305,7 +297,7 @@ filters = "surbl,regexp,chartable,emails"; view { # All directives here may be duplicated to add specific elements or regexp/files # List of ip/mask for this view - #ip = "file:///usr/local/etc/rspamd/ip_internal.inc"; + ip = "file:///usr/local/etc/rspamd/ip_internal.inc"; # From addresses for this view: # list is placed in file: #from = "file:///usr/local/etc/rspamd/from_internal.inc"; diff --git a/src/bloom.c b/src/bloom.c new file mode 100644 index 000000000..8c6bf8edc --- /dev/null +++ b/src/bloom.c @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2009, Rambler media + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY Rambler media ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "bloom.h" + +/* 4 bits are used for counting (implementing delete operation) */ +#define SIZE_BIT 4 + +/* These macroes are for 4 bits for counting element */ +#define INCBIT(a, n, acc) do { \ + acc = a[n * SIZE_BIT / CHAR_BIT] & (0xF << (n % (CHAR_BIT / SIZE_BIT) * SIZE_BIT)); \ + acc ++; \ + acc &= 0xF; \ + \ + a[n * SIZE_BIT / CHAR_BIT] &= (0xF << (4 - (n % (CHAR_BIT/SIZE_BIT) * SIZE_BIT))); \ + a[n * SIZE_BIT / CHAR_BIT] |= (acc << (n % (CHAR_BIT/SIZE_BIT) * SIZE_BIT)); \ +} while (0); + +#define DECBIT(a, n, acc) do { \ + acc = a[n * SIZE_BIT / CHAR_BIT] & (0xF << (n % (CHAR_BIT / SIZE_BIT) * SIZE_BIT)); \ + acc --; \ + acc &= 0xF; \ + \ + a[n * SIZE_BIT / CHAR_BIT] &= (0xF << (4 - (n % (CHAR_BIT/SIZE_BIT) * SIZE_BIT))); \ + a[n * SIZE_BIT / CHAR_BIT] |= (acc << (n % (CHAR_BIT/SIZE_BIT) * SIZE_BIT)); \ +} while (0); + +#define GETBIT(a, n) (a[n * SIZE_BIT / CHAR_BIT] & (0xF << (n % (CHAR_BIT/SIZE_BIT) * SIZE_BIT))) + +/* Common hash functions */ +unsigned int +bloom_sax_hash(const char *key) +{ + unsigned int h = 0; + + while(*key) h ^= (h<<5) + (h>>2) + (unsigned char)*key++; + + return h; +} + +unsigned int +bloom_sdbm_hash(const char *key) +{ + unsigned int h = 0; + + while(*key) h = (unsigned char)*key++ + (h<<6) + (h<<16) - h; + + return h; +} + +unsigned int +bloom_fnv_hash (const char *key) +{ + unsigned int h = 0; + + while (*key) { + h ^= (unsigned char)*key++; + h += (h<<1) + (h<<4) + (h<<7) + (h<<8) + (h<<24); + } + + return h; +} + +unsigned int +bloom_rs_hash (const char *key) +{ + unsigned int b = 378551; + unsigned int a = 63689; + unsigned int hash = 0; + + while (*key) { + hash = hash * a + (unsigned char)*key++; + a = a * b; + } + + return hash; +} + +unsigned int +bloom_js_hash (const char *key) +{ + unsigned int hash = 1315423911; + + while (*key) { + hash ^= ((hash << 5) + (unsigned char)*key++ + (hash >> 2)); + } + + return hash; +} + + +unsigned int +bloom_elf_hash (const char *key) +{ + unsigned int hash = 0; + unsigned int x = 0; + + while (*key) { + hash = (hash << 4) + (unsigned char)*key++; + if((x = hash & 0xF0000000L) != 0) { + hash ^= (x >> 24); + } + hash &= ~x; + } + + return hash; +} + + +unsigned int +bloom_bkdr_hash (const char *key) +{ + unsigned int seed = 131; /* 31 131 1313 13131 131313 etc.. */ + unsigned int hash = 0; + + while (*key) { + hash = (hash * seed) + (unsigned char)*key ++; + } + + return hash; +} + + +unsigned int +bloom_ap_hash (const char *key) +{ + unsigned int hash = 0xAAAAAAAA; + unsigned int i = 0; + + while (*key) { + hash ^= ((i & 1) == 0) ? ((hash << 7) ^ ((unsigned char)*key) * (hash >> 3)) : + (~((hash << 11) + (((unsigned char)*key) ^ (hash >> 5)))); + key++; + } + + return hash; +} + +bloom_filter_t * +bloom_create (size_t size, size_t nfuncs, ...) +{ + bloom_filter_t *bloom; + va_list l; + int n; + + if (!(bloom = g_malloc (sizeof (bloom_filter_t)))) { + return NULL; + } + if (!(bloom->a = g_new (char, (size + CHAR_BIT - 1) / CHAR_BIT * SIZE_BIT))) { + g_free (bloom); + return NULL; + } + if (!(bloom->funcs = (hashfunc_t *) g_malloc (nfuncs * sizeof (hashfunc_t)))) { + g_free (bloom->a); + g_free (bloom); + return NULL; + } + + va_start (l, nfuncs); + for (n = 0; n < nfuncs; ++n) { + bloom->funcs[n] = va_arg (l, hashfunc_t); + } + va_end (l); + + bloom->nfuncs = nfuncs; + bloom->asize = size; + + return bloom; +} + +void +bloom_destroy (bloom_filter_t *bloom) +{ + g_free (bloom->a); + g_free (bloom->funcs); + g_free (bloom); +} + +gboolean +bloom_add (bloom_filter_t *bloom, const char *s) +{ + size_t n; + u_char t; + + for (n = 0; n < bloom->nfuncs; ++n) { + INCBIT (bloom->a, bloom->funcs[n] (s) % bloom->asize, t); + } + + return TRUE; +} + +gboolean +bloom_del (bloom_filter_t *bloom, const char *s) +{ + size_t n; + u_char t; + + for (n = 0; n < bloom->nfuncs; ++n) { + DECBIT (bloom->a, bloom->funcs[n] (s) % bloom->asize, t); + } + + return TRUE; + +} + +gboolean +bloom_check (bloom_filter_t * bloom, const char *s) +{ + size_t n; + + for (n = 0; n < bloom->nfuncs; ++n) { + if (!(GETBIT (bloom->a, bloom->funcs[n] (s) % bloom->asize))) + return FALSE; + } + + return TRUE; +} diff --git a/src/bloom.h b/src/bloom.h new file mode 100644 index 000000000..e97b0aaee --- /dev/null +++ b/src/bloom.h @@ -0,0 +1,33 @@ +#ifndef __RSPAMD_BLOOM_H__ +#define __RSPAMD_BLOOM_H__ + +#include "config.h" + +typedef unsigned int (*hashfunc_t) (const char *); + +typedef struct bloom_filter_s { + size_t asize; + unsigned char *a; + size_t nfuncs; + hashfunc_t *funcs; +} bloom_filter_t; + +/* Hash functions */ +unsigned int bloom_sax_hash (const char *key); +unsigned int bloom_sdbm_hash (const char *key); +unsigned int bloom_fnv_hash (const char *key); +unsigned int bloom_rs_hash (const char *key); +unsigned int bloom_js_hash (const char *key); +unsigned int bloom_elf_hash (const char *key); +unsigned int bloom_bkdr_hash (const char *key); +unsigned int bloom_ap_hash (const char *key); + +#define DEFAULT_BLOOM_HASHES 8, bloom_sax_hash, bloom_sdbm_hash, bloom_fnv_hash, bloom_rs_hash, bloom_js_hash, bloom_elf_hash, bloom_bkdr_hash, bloom_ap_hash + +bloom_filter_t* bloom_create (size_t size, size_t nfuncs, ...); +void bloom_destroy (bloom_filter_t * bloom); +gboolean bloom_add (bloom_filter_t * bloom, const char *s); +gboolean bloom_del (bloom_filter_t * bloom, const char *s); +gboolean bloom_check (bloom_filter_t * bloom, const char *s); + +#endif