diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2014-02-18 18:39:07 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2014-02-18 18:39:07 +0000 |
commit | 20b70f7ade43a74b4790e503ec1dff33798f2cb0 (patch) | |
tree | e3795769d1297b4d7d98e661eed93c8f72dd0a14 | |
parent | 29962c678ab36050ffce1b2849d49eaa54aef921 (diff) | |
download | rspamd-20b70f7ade43a74b4790e503ec1dff33798f2cb0.tar.gz rspamd-20b70f7ade43a74b4790e503ec1dff33798f2cb0.zip |
Rework bloom hash library to use XXHash.
-rw-r--r-- | src/bloom.c | 157 | ||||
-rw-r--r-- | src/bloom.h | 33 | ||||
-rw-r--r-- | src/fuzzy_storage.c | 24 |
3 files changed, 58 insertions, 156 deletions
diff --git a/src/bloom.c b/src/bloom.c index d74d395c9..f857d2e49 100644 --- a/src/bloom.c +++ b/src/bloom.c @@ -24,6 +24,7 @@ #include "config.h" #include "bloom.h" +#include "xxhash.h" /* 4 bits are used for counting (implementing delete operation) */ #define SIZE_BIT 4 @@ -50,130 +51,23 @@ #define GETBIT(a, n) (a[n * SIZE_BIT / CHAR_BIT] & (0xF << (n % (CHAR_BIT/SIZE_BIT) * SIZE_BIT))) /* Common hash functions */ -guint -bloom_sax_hash (const gchar *key) -{ - guint h = 0; - - while (*key) - h ^= (h << 5) + (h >> 2) + (gchar)*key++; - - return h; -} - -guint -bloom_sdbm_hash (const gchar *key) -{ - guint h = 0; - - while (*key) - h = (gchar)*key++ + (h << 6) + (h << 16) - h; - - return h; -} - -guint -bloom_fnv_hash (const gchar *key) -{ - guint h = 0; - - while (*key) { - h ^= (gchar)*key++; - h += (h << 1) + (h << 4) + (h << 7) + (h << 8) + (h << 24); - } - - return h; -} - -guint -bloom_rs_hash (const gchar *key) -{ - guint b = 378551; - guint a = 63689; - guint hash = 0; - - while (*key) { - hash = hash * a + (gchar)*key++; - a = a * b; - } - - return hash; -} - -guint -bloom_js_hash (const gchar *key) -{ - guint hash = 1315423911; - - while (*key) { - hash ^= ((hash << 5) + (gchar)*key++ + (hash >> 2)); - } - - return hash; -} - - -guint -bloom_elf_hash (const gchar *key) -{ - guint hash = 0; - guint x = 0; - - while (*key) { - hash = (hash << 4) + (gchar)*key++; - if ((x = hash & 0xF0000000L) != 0) { - hash ^= (x >> 24); - } - hash &= ~x; - } - - return hash; -} - - -guint -bloom_bkdr_hash (const gchar *key) -{ - guint seed = 131; /* 31 131 1313 13131 131313 etc.. */ - guint hash = 0; - - while (*key) { - hash = (hash * seed) + (gchar)*key++; - } - - return hash; -} -guint -bloom_ap_hash (const gchar *key) +rspamd_bloom_filter_t * +rspamd_bloom_create (size_t size, size_t nfuncs, ...) { - guint hash = 0xAAAAAAAA; - guint i = 0; - - while (*key) { - hash ^= ((i & 1) == 0) ? ((hash << 7) ^ ((gchar)*key) * (hash >> 3)) : (~((hash << 11) + (((gchar)*key) ^ (hash >> 5)))); - key++; - } - - return hash; -} - -bloom_filter_t * -bloom_create (size_t size, size_t nfuncs, ...) -{ - bloom_filter_t *bloom; + rspamd_bloom_filter_t *bloom; va_list l; gsize n; - if (!(bloom = g_malloc (sizeof (bloom_filter_t)))) { + if (!(bloom = g_malloc (sizeof (rspamd_bloom_filter_t)))) { return NULL; } if (!(bloom->a = g_new0 (gchar, (size + CHAR_BIT - 1) / CHAR_BIT * SIZE_BIT))) { g_free (bloom); return NULL; } - if (!(bloom->funcs = (hashfunc_t *) g_malloc (nfuncs * sizeof (hashfunc_t)))) { + if (!(bloom->seeds = g_new0 (guint32, nfuncs))) { g_free (bloom->a); g_free (bloom); return NULL; @@ -181,7 +75,7 @@ bloom_create (size_t size, size_t nfuncs, ...) va_start (l, nfuncs); for (n = 0; n < nfuncs; ++n) { - bloom->funcs[n] = va_arg (l, hashfunc_t); + bloom->seeds[n] = va_arg (l, guint32); } va_end (l); @@ -192,22 +86,26 @@ bloom_create (size_t size, size_t nfuncs, ...) } void -bloom_destroy (bloom_filter_t * bloom) +rspamd_bloom_destroy (rspamd_bloom_filter_t * bloom) { g_free (bloom->a); - g_free (bloom->funcs); + g_free (bloom->seeds); g_free (bloom); } gboolean -bloom_add (bloom_filter_t * bloom, const gchar *s) +rspamd_bloom_add (rspamd_bloom_filter_t * bloom, const gchar *s) { - size_t n; + size_t n, len; u_char t; guint v; + if (s == NULL) { + return FALSE; + } + len = strlen (s); for (n = 0; n < bloom->nfuncs; ++n) { - v = bloom->funcs[n] (s) % bloom->asize; + v = XXH32 (s, len, bloom->seeds[n]) % bloom->asize; INCBIT (bloom->a, v, t); } @@ -215,14 +113,18 @@ bloom_add (bloom_filter_t * bloom, const gchar *s) } gboolean -bloom_del (bloom_filter_t * bloom, const gchar *s) +rspamd_bloom_del (rspamd_bloom_filter_t * bloom, const gchar *s) { - size_t n; + size_t n, len; u_char t; guint v; + if (s == NULL) { + return FALSE; + } + len = strlen (s); for (n = 0; n < bloom->nfuncs; ++n) { - v = bloom->funcs[n] (s) % bloom->asize; + v = XXH32 (s, len, bloom->seeds[n]) % bloom->asize; DECBIT (bloom->a, v, t); } @@ -231,15 +133,20 @@ bloom_del (bloom_filter_t * bloom, const gchar *s) } gboolean -bloom_check (bloom_filter_t * bloom, const gchar *s) +rspamd_bloom_check (rspamd_bloom_filter_t * bloom, const gchar *s) { - size_t n; + size_t n, len; guint v; + if (s == NULL) { + return FALSE; + } + len = strlen (s); for (n = 0; n < bloom->nfuncs; ++n) { - v = bloom->funcs[n] (s) % bloom->asize; - if (!(GETBIT (bloom->a, v))) + v = XXH32 (s, len, bloom->seeds[n]) % bloom->asize; + if (!(GETBIT (bloom->a, v))) { return FALSE; + } } return TRUE; diff --git a/src/bloom.h b/src/bloom.h index eb3d538ba..380143c80 100644 --- a/src/bloom.h +++ b/src/bloom.h @@ -3,26 +3,19 @@ #include "config.h" -typedef guint (*hashfunc_t) (const gchar *); - -typedef struct bloom_filter_s { +typedef struct rspamd_bloom_filter_s { size_t asize; gchar *a; size_t nfuncs; - hashfunc_t *funcs; -} bloom_filter_t; + guint32 *seeds; +} rspamd_bloom_filter_t; -/* Hash functions */ -guint bloom_sax_hash (const gchar *key); -guint bloom_sdbm_hash (const gchar *key); -guint bloom_fnv_hash (const gchar *key); -guint bloom_rs_hash (const gchar *key); -guint bloom_js_hash (const gchar *key); -guint bloom_elf_hash (const gchar *key); -guint bloom_bkdr_hash (const gchar *key); -guint bloom_ap_hash (const gchar *key); -#define DEFAULT_BLOOM_HASHES 8, bloom_sax_hash, bloom_sdbm_hash, bloom_fnv_hash, bloom_rs_hash, bloom_js_hash, bloom_elf_hash, bloom_bkdr_hash, bloom_ap_hash +/* + * Some random uint32 seeds for hashing + */ +#define RSPAMD_DEFAULT_BLOOM_HASHES 8, 0x61782caaU, 0x79ab8141U, 0xe45ee2d1U, \ + 0xf97542d1U, 0x1e2623edU, 0xf5a23cfeU, 0xa41b2508U, 0x85abdce8U /* * Create new bloom filter @@ -30,26 +23,26 @@ guint bloom_ap_hash (const gchar *key); * @param nfuncs number of hash functions * @param ... hash functions list */ -bloom_filter_t* bloom_create (size_t size, size_t nfuncs, ...); +rspamd_bloom_filter_t* rspamd_bloom_create (size_t size, size_t nfuncs, ...); /* * Destroy bloom filter */ -void bloom_destroy (bloom_filter_t * bloom); +void rspamd_bloom_destroy (rspamd_bloom_filter_t * bloom); /* * Add a string to bloom filter */ -gboolean bloom_add (bloom_filter_t * bloom, const gchar *s); +gboolean rspamd_bloom_add (rspamd_bloom_filter_t * bloom, const gchar *s); /* * Delete a string from bloom filter */ -gboolean bloom_del (bloom_filter_t * bloom, const gchar *s); +gboolean rspamd_bloom_del (rspamd_bloom_filter_t * bloom, const gchar *s); /* * Check whether this string is in bloom filter (algorithm produces FALSE-POSITIVES, so result must be checked if it is positive) */ -gboolean bloom_check (bloom_filter_t * bloom, const gchar *s); +gboolean rspamd_bloom_check (rspamd_bloom_filter_t * bloom, const gchar *s); #endif diff --git a/src/fuzzy_storage.c b/src/fuzzy_storage.c index 0c5ee82f6..ecf4267ad 100644 --- a/src/fuzzy_storage.c +++ b/src/fuzzy_storage.c @@ -79,7 +79,7 @@ worker_t fuzzy_worker = { static GQueue *hashes[BUCKETS]; static GQueue *frequent; static GHashTable *static_hash; -static bloom_filter_t *bf; +static rspamd_bloom_filter_t *bf; /* Number of cache modifications */ static guint32 mods = 0; @@ -184,7 +184,7 @@ expire_nodes (gpointer *to_expire, gint expired_num, } server_stat->fuzzy_hashes --; g_hash_table_remove (static_hash, node->h.hash_pipe); - bloom_del (bf, node->h.hash_pipe); + rspamd_bloom_del (bf, node->h.hash_pipe); g_slice_free1 (sizeof (struct rspamd_fuzzy_node), node); } else { @@ -192,7 +192,7 @@ expire_nodes (gpointer *to_expire, gint expired_num, node = (struct rspamd_fuzzy_node *)cur->data; head = hashes[node->h.block_size % BUCKETS]; g_queue_delete_link (head, cur); - bloom_del (bf, node->h.hash_pipe); + rspamd_bloom_del (bf, node->h.hash_pipe); if (node->time != INVALID_NODE_TIME) { server_stat->fuzzy_hashes_expired ++; } @@ -492,7 +492,7 @@ read_hashes_file (struct rspamd_worker *wrk) g_queue_push_head (hashes[node->h.block_size % BUCKETS], node); } } - bloom_add (bf, node->h.hash_pipe); + rspamd_bloom_add (bf, node->h.hash_pipe); if (touch_stat) { server_stat->fuzzy_hashes ++; } @@ -610,7 +610,7 @@ process_check_command (struct fuzzy_cmd *cmd, gint *flag, guint64 time, struct r struct rspamd_fuzzy_node *h; - if (!bloom_check (bf, cmd->hash)) { + if (!rspamd_bloom_check (bf, cmd->hash)) { return 0; } @@ -666,7 +666,7 @@ process_write_command (struct fuzzy_cmd *cmd, guint64 time, struct rspamd_fuzzy_ { struct rspamd_fuzzy_node *h; - if (bloom_check (bf, cmd->hash)) { + if (rspamd_bloom_check (bf, cmd->hash)) { if (update_hash (cmd, time, ctx)) { return TRUE; } @@ -685,8 +685,10 @@ process_write_command (struct fuzzy_cmd *cmd, guint64 time, struct rspamd_fuzzy_ else { g_queue_push_head (hashes[cmd->blocksize % BUCKETS], h); } + rspamd_bloom_add (bf, cmd->hash); + rspamd_rwlock_writer_unlock (ctx->tree_lock); - bloom_add (bf, cmd->hash); + mods++; server_stat->fuzzy_hashes ++; msg_info ("fuzzy hash was successfully added"); @@ -704,7 +706,7 @@ delete_hash (GQueue *hash, fuzzy_hash_t *s, struct rspamd_fuzzy_storage_ctx *ctx if (ctx->strict_hash) { rspamd_rwlock_writer_lock (ctx->tree_lock); if (g_hash_table_remove (static_hash, s->hash_pipe)) { - bloom_del (bf, s->hash_pipe); + rspamd_bloom_del (bf, s->hash_pipe); msg_info ("fuzzy hash was successfully deleted"); server_stat->fuzzy_hashes --; mods++; @@ -723,7 +725,7 @@ delete_hash (GQueue *hash, fuzzy_hash_t *s, struct rspamd_fuzzy_storage_ctx *ctx tmp = cur; cur = g_list_next (cur); g_queue_delete_link (hash, tmp); - bloom_del (bf, s->hash_pipe); + rspamd_bloom_del (bf, s->hash_pipe); msg_info ("fuzzy hash was successfully deleted"); server_stat->fuzzy_hashes --; mods++; @@ -745,7 +747,7 @@ process_delete_command (struct fuzzy_cmd *cmd, guint64 time, struct rspamd_fuzzy fuzzy_hash_t s; gboolean res = FALSE; - if (!bloom_check (bf, cmd->hash)) { + if (!rspamd_bloom_check (bf, cmd->hash)) { return FALSE; } @@ -1039,7 +1041,7 @@ start_fuzzy (struct rspamd_worker *worker) signal_add (&sev, NULL); /* Init bloom filter */ - bf = bloom_create (20000000L, DEFAULT_BLOOM_HASHES); + bf = rspamd_bloom_create (2000000L, RSPAMD_DEFAULT_BLOOM_HASHES); /* Try to read hashes from file */ if (!read_hashes_file (worker)) { msg_err ("cannot read hashes file, it can be created after save procedure"); |