summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2014-02-18 18:39:07 +0000
committerVsevolod Stakhov <vsevolod@highsecure.ru>2014-02-18 18:39:07 +0000
commit20b70f7ade43a74b4790e503ec1dff33798f2cb0 (patch)
treee3795769d1297b4d7d98e661eed93c8f72dd0a14
parent29962c678ab36050ffce1b2849d49eaa54aef921 (diff)
downloadrspamd-20b70f7ade43a74b4790e503ec1dff33798f2cb0.tar.gz
rspamd-20b70f7ade43a74b4790e503ec1dff33798f2cb0.zip
Rework bloom hash library to use XXHash.
-rw-r--r--src/bloom.c157
-rw-r--r--src/bloom.h33
-rw-r--r--src/fuzzy_storage.c24
3 files changed, 58 insertions, 156 deletions
diff --git a/src/bloom.c b/src/bloom.c
index d74d395c9..f857d2e49 100644
--- a/src/bloom.c
+++ b/src/bloom.c
@@ -24,6 +24,7 @@
#include "config.h"
#include "bloom.h"
+#include "xxhash.h"
/* 4 bits are used for counting (implementing delete operation) */
#define SIZE_BIT 4
@@ -50,130 +51,23 @@
#define GETBIT(a, n) (a[n * SIZE_BIT / CHAR_BIT] & (0xF << (n % (CHAR_BIT/SIZE_BIT) * SIZE_BIT)))
/* Common hash functions */
-guint
-bloom_sax_hash (const gchar *key)
-{
- guint h = 0;
-
- while (*key)
- h ^= (h << 5) + (h >> 2) + (gchar)*key++;
-
- return h;
-}
-
-guint
-bloom_sdbm_hash (const gchar *key)
-{
- guint h = 0;
-
- while (*key)
- h = (gchar)*key++ + (h << 6) + (h << 16) - h;
-
- return h;
-}
-
-guint
-bloom_fnv_hash (const gchar *key)
-{
- guint h = 0;
-
- while (*key) {
- h ^= (gchar)*key++;
- h += (h << 1) + (h << 4) + (h << 7) + (h << 8) + (h << 24);
- }
-
- return h;
-}
-
-guint
-bloom_rs_hash (const gchar *key)
-{
- guint b = 378551;
- guint a = 63689;
- guint hash = 0;
-
- while (*key) {
- hash = hash * a + (gchar)*key++;
- a = a * b;
- }
-
- return hash;
-}
-
-guint
-bloom_js_hash (const gchar *key)
-{
- guint hash = 1315423911;
-
- while (*key) {
- hash ^= ((hash << 5) + (gchar)*key++ + (hash >> 2));
- }
-
- return hash;
-}
-
-
-guint
-bloom_elf_hash (const gchar *key)
-{
- guint hash = 0;
- guint x = 0;
-
- while (*key) {
- hash = (hash << 4) + (gchar)*key++;
- if ((x = hash & 0xF0000000L) != 0) {
- hash ^= (x >> 24);
- }
- hash &= ~x;
- }
-
- return hash;
-}
-
-
-guint
-bloom_bkdr_hash (const gchar *key)
-{
- guint seed = 131; /* 31 131 1313 13131 131313 etc.. */
- guint hash = 0;
-
- while (*key) {
- hash = (hash * seed) + (gchar)*key++;
- }
-
- return hash;
-}
-guint
-bloom_ap_hash (const gchar *key)
+rspamd_bloom_filter_t *
+rspamd_bloom_create (size_t size, size_t nfuncs, ...)
{
- guint hash = 0xAAAAAAAA;
- guint i = 0;
-
- while (*key) {
- hash ^= ((i & 1) == 0) ? ((hash << 7) ^ ((gchar)*key) * (hash >> 3)) : (~((hash << 11) + (((gchar)*key) ^ (hash >> 5))));
- key++;
- }
-
- return hash;
-}
-
-bloom_filter_t *
-bloom_create (size_t size, size_t nfuncs, ...)
-{
- bloom_filter_t *bloom;
+ rspamd_bloom_filter_t *bloom;
va_list l;
gsize n;
- if (!(bloom = g_malloc (sizeof (bloom_filter_t)))) {
+ if (!(bloom = g_malloc (sizeof (rspamd_bloom_filter_t)))) {
return NULL;
}
if (!(bloom->a = g_new0 (gchar, (size + CHAR_BIT - 1) / CHAR_BIT * SIZE_BIT))) {
g_free (bloom);
return NULL;
}
- if (!(bloom->funcs = (hashfunc_t *) g_malloc (nfuncs * sizeof (hashfunc_t)))) {
+ if (!(bloom->seeds = g_new0 (guint32, nfuncs))) {
g_free (bloom->a);
g_free (bloom);
return NULL;
@@ -181,7 +75,7 @@ bloom_create (size_t size, size_t nfuncs, ...)
va_start (l, nfuncs);
for (n = 0; n < nfuncs; ++n) {
- bloom->funcs[n] = va_arg (l, hashfunc_t);
+ bloom->seeds[n] = va_arg (l, guint32);
}
va_end (l);
@@ -192,22 +86,26 @@ bloom_create (size_t size, size_t nfuncs, ...)
}
void
-bloom_destroy (bloom_filter_t * bloom)
+rspamd_bloom_destroy (rspamd_bloom_filter_t * bloom)
{
g_free (bloom->a);
- g_free (bloom->funcs);
+ g_free (bloom->seeds);
g_free (bloom);
}
gboolean
-bloom_add (bloom_filter_t * bloom, const gchar *s)
+rspamd_bloom_add (rspamd_bloom_filter_t * bloom, const gchar *s)
{
- size_t n;
+ size_t n, len;
u_char t;
guint v;
+ if (s == NULL) {
+ return FALSE;
+ }
+ len = strlen (s);
for (n = 0; n < bloom->nfuncs; ++n) {
- v = bloom->funcs[n] (s) % bloom->asize;
+ v = XXH32 (s, len, bloom->seeds[n]) % bloom->asize;
INCBIT (bloom->a, v, t);
}
@@ -215,14 +113,18 @@ bloom_add (bloom_filter_t * bloom, const gchar *s)
}
gboolean
-bloom_del (bloom_filter_t * bloom, const gchar *s)
+rspamd_bloom_del (rspamd_bloom_filter_t * bloom, const gchar *s)
{
- size_t n;
+ size_t n, len;
u_char t;
guint v;
+ if (s == NULL) {
+ return FALSE;
+ }
+ len = strlen (s);
for (n = 0; n < bloom->nfuncs; ++n) {
- v = bloom->funcs[n] (s) % bloom->asize;
+ v = XXH32 (s, len, bloom->seeds[n]) % bloom->asize;
DECBIT (bloom->a, v, t);
}
@@ -231,15 +133,20 @@ bloom_del (bloom_filter_t * bloom, const gchar *s)
}
gboolean
-bloom_check (bloom_filter_t * bloom, const gchar *s)
+rspamd_bloom_check (rspamd_bloom_filter_t * bloom, const gchar *s)
{
- size_t n;
+ size_t n, len;
guint v;
+ if (s == NULL) {
+ return FALSE;
+ }
+ len = strlen (s);
for (n = 0; n < bloom->nfuncs; ++n) {
- v = bloom->funcs[n] (s) % bloom->asize;
- if (!(GETBIT (bloom->a, v)))
+ v = XXH32 (s, len, bloom->seeds[n]) % bloom->asize;
+ if (!(GETBIT (bloom->a, v))) {
return FALSE;
+ }
}
return TRUE;
diff --git a/src/bloom.h b/src/bloom.h
index eb3d538ba..380143c80 100644
--- a/src/bloom.h
+++ b/src/bloom.h
@@ -3,26 +3,19 @@
#include "config.h"
-typedef guint (*hashfunc_t) (const gchar *);
-
-typedef struct bloom_filter_s {
+typedef struct rspamd_bloom_filter_s {
size_t asize;
gchar *a;
size_t nfuncs;
- hashfunc_t *funcs;
-} bloom_filter_t;
+ guint32 *seeds;
+} rspamd_bloom_filter_t;
-/* Hash functions */
-guint bloom_sax_hash (const gchar *key);
-guint bloom_sdbm_hash (const gchar *key);
-guint bloom_fnv_hash (const gchar *key);
-guint bloom_rs_hash (const gchar *key);
-guint bloom_js_hash (const gchar *key);
-guint bloom_elf_hash (const gchar *key);
-guint bloom_bkdr_hash (const gchar *key);
-guint bloom_ap_hash (const gchar *key);
-#define DEFAULT_BLOOM_HASHES 8, bloom_sax_hash, bloom_sdbm_hash, bloom_fnv_hash, bloom_rs_hash, bloom_js_hash, bloom_elf_hash, bloom_bkdr_hash, bloom_ap_hash
+/*
+ * Some random uint32 seeds for hashing
+ */
+#define RSPAMD_DEFAULT_BLOOM_HASHES 8, 0x61782caaU, 0x79ab8141U, 0xe45ee2d1U, \
+ 0xf97542d1U, 0x1e2623edU, 0xf5a23cfeU, 0xa41b2508U, 0x85abdce8U
/*
* Create new bloom filter
@@ -30,26 +23,26 @@ guint bloom_ap_hash (const gchar *key);
* @param nfuncs number of hash functions
* @param ... hash functions list
*/
-bloom_filter_t* bloom_create (size_t size, size_t nfuncs, ...);
+rspamd_bloom_filter_t* rspamd_bloom_create (size_t size, size_t nfuncs, ...);
/*
* Destroy bloom filter
*/
-void bloom_destroy (bloom_filter_t * bloom);
+void rspamd_bloom_destroy (rspamd_bloom_filter_t * bloom);
/*
* Add a string to bloom filter
*/
-gboolean bloom_add (bloom_filter_t * bloom, const gchar *s);
+gboolean rspamd_bloom_add (rspamd_bloom_filter_t * bloom, const gchar *s);
/*
* Delete a string from bloom filter
*/
-gboolean bloom_del (bloom_filter_t * bloom, const gchar *s);
+gboolean rspamd_bloom_del (rspamd_bloom_filter_t * bloom, const gchar *s);
/*
* Check whether this string is in bloom filter (algorithm produces FALSE-POSITIVES, so result must be checked if it is positive)
*/
-gboolean bloom_check (bloom_filter_t * bloom, const gchar *s);
+gboolean rspamd_bloom_check (rspamd_bloom_filter_t * bloom, const gchar *s);
#endif
diff --git a/src/fuzzy_storage.c b/src/fuzzy_storage.c
index 0c5ee82f6..ecf4267ad 100644
--- a/src/fuzzy_storage.c
+++ b/src/fuzzy_storage.c
@@ -79,7 +79,7 @@ worker_t fuzzy_worker = {
static GQueue *hashes[BUCKETS];
static GQueue *frequent;
static GHashTable *static_hash;
-static bloom_filter_t *bf;
+static rspamd_bloom_filter_t *bf;
/* Number of cache modifications */
static guint32 mods = 0;
@@ -184,7 +184,7 @@ expire_nodes (gpointer *to_expire, gint expired_num,
}
server_stat->fuzzy_hashes --;
g_hash_table_remove (static_hash, node->h.hash_pipe);
- bloom_del (bf, node->h.hash_pipe);
+ rspamd_bloom_del (bf, node->h.hash_pipe);
g_slice_free1 (sizeof (struct rspamd_fuzzy_node), node);
}
else {
@@ -192,7 +192,7 @@ expire_nodes (gpointer *to_expire, gint expired_num,
node = (struct rspamd_fuzzy_node *)cur->data;
head = hashes[node->h.block_size % BUCKETS];
g_queue_delete_link (head, cur);
- bloom_del (bf, node->h.hash_pipe);
+ rspamd_bloom_del (bf, node->h.hash_pipe);
if (node->time != INVALID_NODE_TIME) {
server_stat->fuzzy_hashes_expired ++;
}
@@ -492,7 +492,7 @@ read_hashes_file (struct rspamd_worker *wrk)
g_queue_push_head (hashes[node->h.block_size % BUCKETS], node);
}
}
- bloom_add (bf, node->h.hash_pipe);
+ rspamd_bloom_add (bf, node->h.hash_pipe);
if (touch_stat) {
server_stat->fuzzy_hashes ++;
}
@@ -610,7 +610,7 @@ process_check_command (struct fuzzy_cmd *cmd, gint *flag, guint64 time, struct r
struct rspamd_fuzzy_node *h;
- if (!bloom_check (bf, cmd->hash)) {
+ if (!rspamd_bloom_check (bf, cmd->hash)) {
return 0;
}
@@ -666,7 +666,7 @@ process_write_command (struct fuzzy_cmd *cmd, guint64 time, struct rspamd_fuzzy_
{
struct rspamd_fuzzy_node *h;
- if (bloom_check (bf, cmd->hash)) {
+ if (rspamd_bloom_check (bf, cmd->hash)) {
if (update_hash (cmd, time, ctx)) {
return TRUE;
}
@@ -685,8 +685,10 @@ process_write_command (struct fuzzy_cmd *cmd, guint64 time, struct rspamd_fuzzy_
else {
g_queue_push_head (hashes[cmd->blocksize % BUCKETS], h);
}
+ rspamd_bloom_add (bf, cmd->hash);
+
rspamd_rwlock_writer_unlock (ctx->tree_lock);
- bloom_add (bf, cmd->hash);
+
mods++;
server_stat->fuzzy_hashes ++;
msg_info ("fuzzy hash was successfully added");
@@ -704,7 +706,7 @@ delete_hash (GQueue *hash, fuzzy_hash_t *s, struct rspamd_fuzzy_storage_ctx *ctx
if (ctx->strict_hash) {
rspamd_rwlock_writer_lock (ctx->tree_lock);
if (g_hash_table_remove (static_hash, s->hash_pipe)) {
- bloom_del (bf, s->hash_pipe);
+ rspamd_bloom_del (bf, s->hash_pipe);
msg_info ("fuzzy hash was successfully deleted");
server_stat->fuzzy_hashes --;
mods++;
@@ -723,7 +725,7 @@ delete_hash (GQueue *hash, fuzzy_hash_t *s, struct rspamd_fuzzy_storage_ctx *ctx
tmp = cur;
cur = g_list_next (cur);
g_queue_delete_link (hash, tmp);
- bloom_del (bf, s->hash_pipe);
+ rspamd_bloom_del (bf, s->hash_pipe);
msg_info ("fuzzy hash was successfully deleted");
server_stat->fuzzy_hashes --;
mods++;
@@ -745,7 +747,7 @@ process_delete_command (struct fuzzy_cmd *cmd, guint64 time, struct rspamd_fuzzy
fuzzy_hash_t s;
gboolean res = FALSE;
- if (!bloom_check (bf, cmd->hash)) {
+ if (!rspamd_bloom_check (bf, cmd->hash)) {
return FALSE;
}
@@ -1039,7 +1041,7 @@ start_fuzzy (struct rspamd_worker *worker)
signal_add (&sev, NULL);
/* Init bloom filter */
- bf = bloom_create (20000000L, DEFAULT_BLOOM_HASHES);
+ bf = rspamd_bloom_create (2000000L, RSPAMD_DEFAULT_BLOOM_HASHES);
/* Try to read hashes from file */
if (!read_hashes_file (worker)) {
msg_err ("cannot read hashes file, it can be created after save procedure");