From ae6c8d55b14744b622146be1c89c5cd7163ec183 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 18 Feb 2014 18:39:07 +0000 Subject: [PATCH] Rework bloom hash library to use XXHash. Conflicts: src/fuzzy_storage.c --- src/bloom.c | 157 +++++++++----------------------------------- src/bloom.h | 33 ++++------ src/fuzzy_storage.c | 40 ++++++++--- 3 files changed, 77 insertions(+), 153 deletions(-) diff --git a/src/bloom.c b/src/bloom.c index d74d395c9..f857d2e49 100644 --- a/src/bloom.c +++ b/src/bloom.c @@ -24,6 +24,7 @@ #include "config.h" #include "bloom.h" +#include "xxhash.h" /* 4 bits are used for counting (implementing delete operation) */ #define SIZE_BIT 4 @@ -50,130 +51,23 @@ #define GETBIT(a, n) (a[n * SIZE_BIT / CHAR_BIT] & (0xF << (n % (CHAR_BIT/SIZE_BIT) * SIZE_BIT))) /* Common hash functions */ -guint -bloom_sax_hash (const gchar *key) -{ - guint h = 0; - - while (*key) - h ^= (h << 5) + (h >> 2) + (gchar)*key++; - - return h; -} - -guint -bloom_sdbm_hash (const gchar *key) -{ - guint h = 0; - - while (*key) - h = (gchar)*key++ + (h << 6) + (h << 16) - h; - - return h; -} - -guint -bloom_fnv_hash (const gchar *key) -{ - guint h = 0; - - while (*key) { - h ^= (gchar)*key++; - h += (h << 1) + (h << 4) + (h << 7) + (h << 8) + (h << 24); - } - - return h; -} - -guint -bloom_rs_hash (const gchar *key) -{ - guint b = 378551; - guint a = 63689; - guint hash = 0; - - while (*key) { - hash = hash * a + (gchar)*key++; - a = a * b; - } - - return hash; -} - -guint -bloom_js_hash (const gchar *key) -{ - guint hash = 1315423911; - - while (*key) { - hash ^= ((hash << 5) + (gchar)*key++ + (hash >> 2)); - } - - return hash; -} - - -guint -bloom_elf_hash (const gchar *key) -{ - guint hash = 0; - guint x = 0; - - while (*key) { - hash = (hash << 4) + (gchar)*key++; - if ((x = hash & 0xF0000000L) != 0) { - hash ^= (x >> 24); - } - hash &= ~x; - } - - return hash; -} - - -guint -bloom_bkdr_hash (const gchar *key) -{ - guint seed = 131; /* 31 131 1313 13131 131313 etc.. */ - guint hash = 0; - - while (*key) { - hash = (hash * seed) + (gchar)*key++; - } - - return hash; -} -guint -bloom_ap_hash (const gchar *key) +rspamd_bloom_filter_t * +rspamd_bloom_create (size_t size, size_t nfuncs, ...) { - guint hash = 0xAAAAAAAA; - guint i = 0; - - while (*key) { - hash ^= ((i & 1) == 0) ? ((hash << 7) ^ ((gchar)*key) * (hash >> 3)) : (~((hash << 11) + (((gchar)*key) ^ (hash >> 5)))); - key++; - } - - return hash; -} - -bloom_filter_t * -bloom_create (size_t size, size_t nfuncs, ...) -{ - bloom_filter_t *bloom; + rspamd_bloom_filter_t *bloom; va_list l; gsize n; - if (!(bloom = g_malloc (sizeof (bloom_filter_t)))) { + if (!(bloom = g_malloc (sizeof (rspamd_bloom_filter_t)))) { return NULL; } if (!(bloom->a = g_new0 (gchar, (size + CHAR_BIT - 1) / CHAR_BIT * SIZE_BIT))) { g_free (bloom); return NULL; } - if (!(bloom->funcs = (hashfunc_t *) g_malloc (nfuncs * sizeof (hashfunc_t)))) { + if (!(bloom->seeds = g_new0 (guint32, nfuncs))) { g_free (bloom->a); g_free (bloom); return NULL; @@ -181,7 +75,7 @@ bloom_create (size_t size, size_t nfuncs, ...) va_start (l, nfuncs); for (n = 0; n < nfuncs; ++n) { - bloom->funcs[n] = va_arg (l, hashfunc_t); + bloom->seeds[n] = va_arg (l, guint32); } va_end (l); @@ -192,22 +86,26 @@ bloom_create (size_t size, size_t nfuncs, ...) } void -bloom_destroy (bloom_filter_t * bloom) +rspamd_bloom_destroy (rspamd_bloom_filter_t * bloom) { g_free (bloom->a); - g_free (bloom->funcs); + g_free (bloom->seeds); g_free (bloom); } gboolean -bloom_add (bloom_filter_t * bloom, const gchar *s) +rspamd_bloom_add (rspamd_bloom_filter_t * bloom, const gchar *s) { - size_t n; + size_t n, len; u_char t; guint v; + if (s == NULL) { + return FALSE; + } + len = strlen (s); for (n = 0; n < bloom->nfuncs; ++n) { - v = bloom->funcs[n] (s) % bloom->asize; + v = XXH32 (s, len, bloom->seeds[n]) % bloom->asize; INCBIT (bloom->a, v, t); } @@ -215,14 +113,18 @@ bloom_add (bloom_filter_t * bloom, const gchar *s) } gboolean -bloom_del (bloom_filter_t * bloom, const gchar *s) +rspamd_bloom_del (rspamd_bloom_filter_t * bloom, const gchar *s) { - size_t n; + size_t n, len; u_char t; guint v; + if (s == NULL) { + return FALSE; + } + len = strlen (s); for (n = 0; n < bloom->nfuncs; ++n) { - v = bloom->funcs[n] (s) % bloom->asize; + v = XXH32 (s, len, bloom->seeds[n]) % bloom->asize; DECBIT (bloom->a, v, t); } @@ -231,15 +133,20 @@ bloom_del (bloom_filter_t * bloom, const gchar *s) } gboolean -bloom_check (bloom_filter_t * bloom, const gchar *s) +rspamd_bloom_check (rspamd_bloom_filter_t * bloom, const gchar *s) { - size_t n; + size_t n, len; guint v; + if (s == NULL) { + return FALSE; + } + len = strlen (s); for (n = 0; n < bloom->nfuncs; ++n) { - v = bloom->funcs[n] (s) % bloom->asize; - if (!(GETBIT (bloom->a, v))) + v = XXH32 (s, len, bloom->seeds[n]) % bloom->asize; + if (!(GETBIT (bloom->a, v))) { return FALSE; + } } return TRUE; diff --git a/src/bloom.h b/src/bloom.h index eb3d538ba..380143c80 100644 --- a/src/bloom.h +++ b/src/bloom.h @@ -3,26 +3,19 @@ #include "config.h" -typedef guint (*hashfunc_t) (const gchar *); - -typedef struct bloom_filter_s { +typedef struct rspamd_bloom_filter_s { size_t asize; gchar *a; size_t nfuncs; - hashfunc_t *funcs; -} bloom_filter_t; + guint32 *seeds; +} rspamd_bloom_filter_t; -/* Hash functions */ -guint bloom_sax_hash (const gchar *key); -guint bloom_sdbm_hash (const gchar *key); -guint bloom_fnv_hash (const gchar *key); -guint bloom_rs_hash (const gchar *key); -guint bloom_js_hash (const gchar *key); -guint bloom_elf_hash (const gchar *key); -guint bloom_bkdr_hash (const gchar *key); -guint bloom_ap_hash (const gchar *key); -#define DEFAULT_BLOOM_HASHES 8, bloom_sax_hash, bloom_sdbm_hash, bloom_fnv_hash, bloom_rs_hash, bloom_js_hash, bloom_elf_hash, bloom_bkdr_hash, bloom_ap_hash +/* + * Some random uint32 seeds for hashing + */ +#define RSPAMD_DEFAULT_BLOOM_HASHES 8, 0x61782caaU, 0x79ab8141U, 0xe45ee2d1U, \ + 0xf97542d1U, 0x1e2623edU, 0xf5a23cfeU, 0xa41b2508U, 0x85abdce8U /* * Create new bloom filter @@ -30,26 +23,26 @@ guint bloom_ap_hash (const gchar *key); * @param nfuncs number of hash functions * @param ... hash functions list */ -bloom_filter_t* bloom_create (size_t size, size_t nfuncs, ...); +rspamd_bloom_filter_t* rspamd_bloom_create (size_t size, size_t nfuncs, ...); /* * Destroy bloom filter */ -void bloom_destroy (bloom_filter_t * bloom); +void rspamd_bloom_destroy (rspamd_bloom_filter_t * bloom); /* * Add a string to bloom filter */ -gboolean bloom_add (bloom_filter_t * bloom, const gchar *s); +gboolean rspamd_bloom_add (rspamd_bloom_filter_t * bloom, const gchar *s); /* * Delete a string from bloom filter */ -gboolean bloom_del (bloom_filter_t * bloom, const gchar *s); +gboolean rspamd_bloom_del (rspamd_bloom_filter_t * bloom, const gchar *s); /* * Check whether this string is in bloom filter (algorithm produces FALSE-POSITIVES, so result must be checked if it is positive) */ -gboolean bloom_check (bloom_filter_t * bloom, const gchar *s); +gboolean rspamd_bloom_check (rspamd_bloom_filter_t * bloom, const gchar *s); #endif diff --git a/src/fuzzy_storage.c b/src/fuzzy_storage.c index 1c8d97cd3..9b97cda18 100644 --- a/src/fuzzy_storage.c +++ b/src/fuzzy_storage.c @@ -79,7 +79,7 @@ worker_t fuzzy_worker = { static GQueue *hashes[BUCKETS]; static GQueue *frequent; static GHashTable *static_hash; -static rspamd_bloom_filter_t *bf; +static bloom_filter_t *bf; /* Number of cache modifications */ static guint32 mods = 0; @@ -184,7 +184,7 @@ expire_nodes (gpointer *to_expire, gint expired_num, } server_stat->fuzzy_hashes --; g_hash_table_remove (static_hash, node->h.hash_pipe); - rspamd_bloom_del (bf, node->h.hash_pipe); + bloom_del (bf, node->h.hash_pipe); g_slice_free1 (sizeof (struct rspamd_fuzzy_node), node); } else { @@ -192,7 +192,7 @@ expire_nodes (gpointer *to_expire, gint expired_num, node = (struct rspamd_fuzzy_node *)cur->data; head = hashes[node->h.block_size % BUCKETS]; g_queue_delete_link (head, cur); - rspamd_bloom_del (bf, node->h.hash_pipe); + bloom_del (bf, node->h.hash_pipe); if (node->time != INVALID_NODE_TIME) { server_stat->fuzzy_hashes_expired ++; } @@ -492,7 +492,7 @@ read_hashes_file (struct rspamd_worker *wrk) g_queue_push_head (hashes[node->h.block_size % BUCKETS], node); } } - rspamd_bloom_add (bf, node->h.hash_pipe); + bloom_add (bf, node->h.hash_pipe); if (touch_stat) { server_stat->fuzzy_hashes ++; } @@ -610,7 +610,7 @@ process_check_command (struct fuzzy_cmd *cmd, gint *flag, guint64 time, struct r struct rspamd_fuzzy_node *h; - if (!rspamd_bloom_check (bf, cmd->hash)) { + if (!bloom_check (bf, cmd->hash)) { return 0; } @@ -725,7 +725,7 @@ delete_hash (GQueue *hash, fuzzy_hash_t *s, struct rspamd_fuzzy_storage_ctx *ctx tmp = cur; cur = g_list_next (cur); g_queue_delete_link (hash, tmp); - rspamd_bloom_del (bf, s->hash_pipe); + bloom_del (bf, s->hash_pipe); msg_info ("fuzzy hash was successfully deleted"); server_stat->fuzzy_hashes --; mods++; @@ -747,7 +747,7 @@ process_delete_command (struct fuzzy_cmd *cmd, guint64 time, struct rspamd_fuzzy fuzzy_hash_t s; gboolean res = FALSE; - if (!rspamd_bloom_check (bf, cmd->hash)) { + if (!bloom_check (bf, cmd->hash)) { return FALSE; } @@ -937,6 +937,30 @@ sync_callback (gint fd, short what, void *arg) rspamd_mutex_unlock (ctx->update_mtx); } +static gboolean +parse_fuzzy_update_list (struct rspamd_fuzzy_storage_ctx *ctx) +{ + gchar **strvec, **cur; + struct in_addr ina; + guint32 mask; + + strvec = g_strsplit_set (ctx->update_map, ",", 0); + cur = strvec; + + while (*cur != NULL) { + /* XXX: handle only ipv4 addresses */ + if (parse_ipmask_v4 (*cur, &ina, &mask)) { + if (ctx->update_ips == NULL) { + ctx->update_ips = radix_tree_create (); + } + radix32tree_add (ctx->update_ips, htonl (ina.s_addr), mask, 1); + } + cur ++; + } + + return (ctx->update_ips != NULL); +} + gpointer init_fuzzy (struct config_file *cfg) { @@ -1047,7 +1071,7 @@ start_fuzzy (struct rspamd_worker *worker) if (ctx->update_map != NULL) { if (!add_map (worker->srv->cfg, ctx->update_map, "Allow fuzzy updates from specified addresses", read_radix_list, fin_radix_list, (void **)&ctx->update_ips)) { - if (!rspamd_parse_ip_list (ctx->update_map, &ctx->update_ips)) { + if (!parse_fuzzy_update_list (ctx)) { msg_warn ("cannot load or parse ip list from '%s'", ctx->update_map); } } -- 2.39.5