From 8e399cdba1bba1da8c1de2b8a22efe719aa30cde Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 8 Oct 2012 21:21:53 +0400 Subject: [PATCH] * Use murmur hash for all hashes as it is more efficient and provides more uniform distribution as glib's default one. * Fix probability renormalization while using advanced classification. --- src/cfg_utils.c | 28 +++--- src/cfg_xml.c | 12 +-- src/classifiers/bayes.c | 38 +++++-- src/client/rspamc.c | 14 +-- src/expressions.c | 4 +- src/filter.c | 2 +- src/lmtp.c | 2 +- src/lua_worker.c | 2 +- src/main.c | 4 +- src/mem_pool.c | 2 +- src/plugins/regexp.c | 2 +- src/settings.c | 12 +-- src/symbols_cache.c | 2 +- src/util.c | 218 +++++++++++++++++++++++++++++++++++++++- src/util.h | 42 ++++++++ src/worker_util.c | 4 +- 16 files changed, 329 insertions(+), 59 deletions(-) diff --git a/src/cfg_utils.c b/src/cfg_utils.c index 7dfe54462..37f28c822 100644 --- a/src/cfg_utils.c +++ b/src/cfg_utils.c @@ -215,14 +215,14 @@ init_defaults (struct config_file *cfg) cfg->max_diff = 20480; cfg->max_statfile_size = DEFAULT_STATFILE_SIZE; - cfg->modules_opts = g_hash_table_new (g_str_hash, g_str_equal); - cfg->variables = g_hash_table_new (g_str_hash, g_str_equal); - cfg->metrics = g_hash_table_new (g_str_hash, g_str_equal); - cfg->c_modules = g_hash_table_new (g_str_hash, g_str_equal); - cfg->composite_symbols = g_hash_table_new (g_str_hash, g_str_equal); - cfg->classifiers_symbols = g_hash_table_new (g_str_hash, g_str_equal); - cfg->cfg_params = g_hash_table_new (g_str_hash, g_str_equal); - cfg->metrics_symbols = g_hash_table_new (g_str_hash, g_str_equal); + cfg->modules_opts = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); + cfg->variables = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); + cfg->metrics = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); + cfg->c_modules = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); + cfg->composite_symbols = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); + cfg->classifiers_symbols = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); + cfg->cfg_params = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); + cfg->metrics_symbols = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); cfg->map_timeout = DEFAULT_MAP_TIMEOUT; @@ -825,11 +825,11 @@ check_classifier_conf (struct config_file *cfg, struct classifier_config *c) c = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct classifier_config)); } if (c->opts == NULL) { - c->opts = g_hash_table_new (g_str_hash, g_str_equal); + c->opts = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func) g_hash_table_destroy, c->opts); } if (c->labels == NULL) { - c->labels = g_hash_table_new_full (g_str_hash, g_str_equal, NULL, (GDestroyNotify)g_list_free); + c->labels = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, NULL, (GDestroyNotify)g_list_free); memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func) g_hash_table_destroy, c->labels); } @@ -843,7 +843,7 @@ check_statfile_conf (struct config_file *cfg, struct statfile *c) c = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile)); } if (c->opts == NULL) { - c->opts = g_hash_table_new (g_str_hash, g_str_equal); + c->opts = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func) g_hash_table_destroy, c->opts); } @@ -857,8 +857,8 @@ check_metric_conf (struct config_file *cfg, struct metric *c) c = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct metric)); c->action = METRIC_ACTION_REJECT; c->grow_factor = 1.0; - c->symbols = g_hash_table_new (g_str_hash, g_str_equal); - c->descriptions = g_hash_table_new (g_str_hash, g_str_equal); + c->symbols = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); + c->descriptions = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func) g_hash_table_destroy, c->symbols); memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func) g_hash_table_destroy, c->descriptions); } @@ -871,7 +871,7 @@ check_worker_conf (struct config_file *cfg, struct worker_conf *c) { if (c == NULL) { c = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct worker_conf)); - c->params = g_hash_table_new (g_str_hash, g_str_equal); + c->params = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); c->active_workers = g_queue_new (); memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func)g_hash_table_destroy, c->params); memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func)g_queue_free, c->active_workers); diff --git a/src/cfg_xml.c b/src/cfg_xml.c index 99aad12ab..277172f2b 100644 --- a/src/cfg_xml.c +++ b/src/cfg_xml.c @@ -2310,10 +2310,10 @@ register_module_opt (const gchar *mname, const gchar *optname, enum module_opt_t GHashTable *module; if (module_options == NULL) { - module_options = g_hash_table_new (g_str_hash, g_str_equal); + module_options = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); } if ((module = g_hash_table_lookup (module_options, mname)) == NULL) { - module = g_hash_table_new (g_str_hash, g_str_equal); + module = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); g_hash_table_insert (module_options, (char *)mname, module); } if ((param = g_hash_table_lookup (module, optname)) == NULL) { @@ -2348,7 +2348,7 @@ register_worker_opt (gint wtype, const gchar *optname, element_handler_func func worker_options = g_hash_table_new (g_int_hash, g_int_equal); } if ((worker = g_hash_table_lookup (worker_options, &wtype)) == NULL) { - worker = g_hash_table_new (g_str_hash, g_str_equal); + worker = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); new_key = g_malloc (sizeof (gint)); *new_key = wtype; g_hash_table_insert (worker_options, new_key, worker); @@ -2372,10 +2372,10 @@ register_classifier_opt (const gchar *ctype, const gchar *optname) GHashTable *classifier; if (classifier_options == NULL) { - classifier_options = g_hash_table_new (g_str_hash, g_str_equal); + classifier_options = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); } if ((classifier = g_hash_table_lookup (classifier_options, ctype)) == NULL) { - classifier = g_hash_table_new (g_str_hash, g_str_equal); + classifier = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); g_hash_table_insert (classifier_options, (char *)ctype, classifier); } if ((param = g_hash_table_lookup (classifier, optname)) == NULL) { @@ -2406,7 +2406,7 @@ register_subparser (const gchar *tag, enum xml_read_state state, const GMarkupPa struct xml_subparser *subparser; if (subparsers == NULL) { - subparsers = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free); + subparsers = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, g_free, g_free); } subparser = g_malloc (sizeof (struct xml_subparser)); diff --git a/src/classifiers/bayes.c b/src/classifiers/bayes.c index a80bbe0ba..0e68f5d72 100644 --- a/src/classifiers/bayes.c +++ b/src/classifiers/bayes.c @@ -182,10 +182,10 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, { struct bayes_callback_data data; gchar *value; - gint nodes, i = 0, cnt, best_num = 0; + gint nodes, i = 0, cnt, best_num_spam = 0, best_num_ham = 0; gint minnodes; guint64 rev, total_learns = 0; - double best = 0; + double best_spam = 0., best_ham = 0., total_spam = 0., total_ham = 0.; struct statfile *st; stat_file_t *file; GList *cur; @@ -262,17 +262,37 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, for (i = 0; i < cnt; i ++) { debug_task ("got probability for symbol %s: %.2f", data.statfiles[i].st->symbol, data.statfiles[i].post_probability); - if (data.statfiles[i].post_probability > best) { - best = data.statfiles[i].post_probability; - best_num = i; + + if (data.statfiles[i].st->is_spam) { + total_spam += data.statfiles[i].post_probability; + if (data.statfiles[i].post_probability > best_spam) { + best_spam = data.statfiles[i].post_probability; + best_num_spam = i; + } + } + else { + total_ham += data.statfiles[i].post_probability; + if (data.statfiles[i].post_probability > best_ham) { + best_ham = data.statfiles[i].post_probability; + best_num_ham = i; + } } } - if (best > 0.5) { + + if (total_ham > 0.5 || total_spam > 0.5) { + sumbuf = memory_pool_alloc (task->task_pool, 32); - rspamd_snprintf (sumbuf, 32, "%.2f", best); - cur = g_list_prepend (NULL, sumbuf); - insert_result (task, data.statfiles[best_num].st->symbol, best, cur); + if (total_ham > total_spam) { + rspamd_snprintf (sumbuf, 32, "%.2f", total_ham); + cur = g_list_prepend (NULL, sumbuf); + insert_result (task, data.statfiles[best_num_ham].st->symbol, total_ham, cur); + } + else { + rspamd_snprintf (sumbuf, 32, "%.2f", total_spam); + cur = g_list_prepend (NULL, sumbuf); + insert_result (task, data.statfiles[best_num_spam].st->symbol, total_spam, cur); + } } g_free (data.statfiles); diff --git a/src/client/rspamc.c b/src/client/rspamc.c index aae8345d5..74423bbf3 100644 --- a/src/client/rspamc.c +++ b/src/client/rspamc.c @@ -352,7 +352,7 @@ scan_rspamd_stdin (void) GHashTable *opts; /* Init options hash */ - opts = g_hash_table_new (g_str_hash, g_str_equal); + opts = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); add_options (opts); /* Add server */ add_rspamd_server (FALSE); @@ -390,7 +390,7 @@ scan_rspamd_file (const gchar *file) /* Add server */ add_rspamd_server (FALSE); /* Init options hash */ - opts = g_hash_table_new (g_str_hash, g_str_equal); + opts = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); add_options (opts); res = rspamd_scan_file (client, file, opts, &err); g_hash_table_destroy (opts); @@ -435,7 +435,7 @@ learn_rspamd_stdin (gboolean is_spam) } } - params = g_hash_table_new (g_str_hash, g_str_equal); + params = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); g_hash_table_insert (params, "Classifier", classifier); results = rspamd_controller_command_memory (client, is_spam ? "learn_spam" : "learn_ham", password, params, in_buf, r, &err); @@ -482,7 +482,7 @@ learn_rspamd_file (gboolean is_spam, const gchar *file) /* Add server */ add_rspamd_server (TRUE); - params = g_hash_table_new (g_str_hash, g_str_equal); + params = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); g_hash_table_insert (params, "Classifier", classifier); results = rspamd_controller_command_file (client, is_spam ? "learn_spam" : "learn_ham", password, params, file, &err); @@ -525,7 +525,7 @@ fuzzy_rspamd_stdin (gboolean delete) gchar valuebuf[sizeof("65535")], flagbuf[sizeof("65535")]; struct rspamd_controller_result *res; - params = g_hash_table_new (g_str_hash, g_str_equal); + params = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); rspamd_snprintf (valuebuf, sizeof (valuebuf), "%d", weight); rspamd_snprintf (flagbuf, sizeof (flagbuf), "%d", flag); g_hash_table_insert (params, "Value", valuebuf); @@ -588,7 +588,7 @@ fuzzy_rspamd_file (const gchar *file, gboolean delete) /* Add server */ add_rspamd_server (TRUE); - params = g_hash_table_new (g_str_hash, g_str_equal); + params = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); rspamd_snprintf (valuebuf, sizeof (valuebuf), "%d", weight); rspamd_snprintf (flagbuf, sizeof (flagbuf), "%d", flag); g_hash_table_insert (params, "Value", valuebuf); @@ -671,7 +671,7 @@ main (gint argc, gchar **argv, gchar **env) GHashTable *kwattrs; - kwattrs = g_hash_table_new (g_str_hash, g_str_equal); + kwattrs = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); read_cmd_line (&argc, &argv); diff --git a/src/expressions.c b/src/expressions.c index 1f04843bd..f3356a218 100644 --- a/src/expressions.c +++ b/src/expressions.c @@ -87,7 +87,7 @@ re_cache_check (const gchar *line, memory_pool_t *pool) re_cache = memory_pool_get_variable (pool, "re_cache"); if (re_cache == NULL) { - re_cache = g_hash_table_new (g_str_hash, g_str_equal); + re_cache = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); memory_pool_set_variable (pool, "re_cache", re_cache, (pool_destruct_func)g_hash_table_destroy); return NULL; } @@ -102,7 +102,7 @@ re_cache_add (const gchar *line, void *pointer, memory_pool_t *pool) re_cache = memory_pool_get_variable (pool, "re_cache"); if (re_cache == NULL) { - re_cache = g_hash_table_new (g_str_hash, g_str_equal); + re_cache = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); memory_pool_set_variable (pool, "re_cache", re_cache, (pool_destruct_func)g_hash_table_destroy); } diff --git a/src/filter.c b/src/filter.c index d26a34af0..bbe3b4678 100644 --- a/src/filter.c +++ b/src/filter.c @@ -75,7 +75,7 @@ insert_metric_result (struct worker_task *task, struct metric *metric, const gch if (metric_res == NULL) { /* Create new metric chain */ metric_res = memory_pool_alloc (task->task_pool, sizeof (struct metric_result)); - metric_res->symbols = g_hash_table_new (g_str_hash, g_str_equal); + metric_res->symbols = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); metric_res->checked = FALSE; memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_hash_table_unref, metric_res->symbols); metric_res->metric = metric; diff --git a/src/lmtp.c b/src/lmtp.c index 4e7373158..67743e92a 100644 --- a/src/lmtp.c +++ b/src/lmtp.c @@ -271,7 +271,7 @@ accept_socket (gint fd, short what, void *arg) new_task->task_pool = memory_pool_new (memory_pool_get_size ()); /* Add destructor for recipients list (it would be better to use anonymous function here */ memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func) rcpt_destruct, new_task); - new_task->results = g_hash_table_new (g_str_hash, g_str_equal); + new_task->results = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); new_task->ev_base = worker->ctx; memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func) g_hash_table_destroy, new_task->results); worker->srv->stat->connections_count++; diff --git a/src/lua_worker.c b/src/lua_worker.c index c75d275a6..c4276968a 100644 --- a/src/lua_worker.c +++ b/src/lua_worker.c @@ -410,7 +410,7 @@ init_lua_worker (void) type = g_quark_try_string ("lua"); ctx = g_malloc0 (sizeof (struct rspamd_lua_worker_ctx)); - ctx->params = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, (GDestroyNotify)g_list_free); + ctx->params = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, g_free, (GDestroyNotify)g_list_free); register_worker_opt (type, "file", xml_handle_string, ctx, G_STRUCT_OFFSET (struct rspamd_lua_worker_ctx, file)); diff --git a/src/main.c b/src/main.c index a5cad0ba9..d28672e83 100644 --- a/src/main.c +++ b/src/main.c @@ -878,7 +878,7 @@ main (gint argc, gchar **argv, gchar **env) } /* Init counters */ - rspamd_main->counters = rspamd_hash_new_shared (rspamd_main->server_pool, g_str_hash, g_str_equal, 64); + rspamd_main->counters = rspamd_hash_new_shared (rspamd_main->server_pool, rspamd_str_hash, rspamd_str_equal, 64); /* Init listen sockets hash */ listen_sockets = g_hash_table_new (g_direct_hash, g_direct_equal); @@ -893,7 +893,7 @@ main (gint argc, gchar **argv, gchar **env) rspamd_main->cfg->cache = g_new0 (struct symbols_cache, 1); rspamd_main->cfg->cache->static_pool = memory_pool_new (memory_pool_get_size ()); rspamd_main->cfg->cache->cfg = rspamd_main->cfg; - rspamd_main->cfg->cache->items_by_symbol = g_hash_table_new (g_str_hash, g_str_equal); + rspamd_main->cfg->cache->items_by_symbol = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); /* Load config */ if (! load_rspamd_config (rspamd_main->cfg, TRUE)) { diff --git a/src/mem_pool.c b/src/mem_pool.c index f7636e538..5ab582352 100644 --- a/src/mem_pool.c +++ b/src/mem_pool.c @@ -746,7 +746,7 @@ void memory_pool_set_variable (memory_pool_t *pool, const gchar *name, gpointer value, pool_destruct_func destructor) { if (pool->variables == NULL) { - pool->variables = g_hash_table_new (g_str_hash, g_str_equal); + pool->variables = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); } g_hash_table_insert (pool->variables, memory_pool_strdup (pool, name), value); diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c index f5c94bd1f..ebb4ff648 100644 --- a/src/plugins/regexp.c +++ b/src/plugins/regexp.c @@ -514,7 +514,7 @@ regexp_module_init (struct config_file *cfg, struct module_ctx **ctx) regexp_module_ctx->filter = regexp_common_filter; regexp_module_ctx->regexp_pool = memory_pool_new (memory_pool_get_size ()); regexp_module_ctx->dynamic_pool = NULL; - regexp_module_ctx->autolearn_symbols = g_hash_table_new (g_str_hash, g_str_equal); + regexp_module_ctx->autolearn_symbols = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); regexp_module_ctx->workers = NULL; *ctx = (struct module_ctx *)regexp_module_ctx; diff --git a/src/settings.c b/src/settings.c index d21fe02d0..0d856bf3f 100644 --- a/src/settings.c +++ b/src/settings.c @@ -81,12 +81,12 @@ settings_ref (struct rspamd_settings *s) { if (s == NULL) { s = g_slice_alloc (sizeof (struct rspamd_settings)); - s->metric_scores = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free); - s->reject_scores = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free); - s->metric_actions = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, settings_actions_free); - s->factors = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free); - s->whitelist = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free); - s->blacklist = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free); + s->metric_scores = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, g_free, g_free); + s->reject_scores = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, g_free, g_free); + s->metric_actions = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, g_free, settings_actions_free); + s->factors = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, g_free, g_free); + s->whitelist = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, g_free, g_free); + s->blacklist = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, g_free, g_free); s->statfile_alias = NULL; s->want_spam = FALSE; s->ref_count = 1; diff --git a/src/symbols_cache.c b/src/symbols_cache.c index 9b8d57026..b273ea1c9 100644 --- a/src/symbols_cache.c +++ b/src/symbols_cache.c @@ -271,7 +271,7 @@ register_symbol_common (struct symbols_cache **cache, const gchar *name, double pcache = g_new0 (struct symbols_cache, 1); *cache = pcache; pcache->static_pool = memory_pool_new (memory_pool_get_size ()); - pcache->items_by_symbol = g_hash_table_new (g_str_hash, g_str_equal); + pcache->items_by_symbol = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); } item = memory_pool_alloc0 (pcache->static_pool, sizeof (struct cache_item)); diff --git a/src/util.c b/src/util.c index 30c85ddf2..707f4c449 100644 --- a/src/util.c +++ b/src/util.c @@ -947,6 +947,7 @@ calculate_check_time (struct timeval *begin, gint resolution) # define g_tolower(x) (((x) >= 'A' && (x) <= 'Z') ? (x) - 'A' + 'a' : (x)) #endif + gboolean rspamd_strcase_equal (gconstpointer v, gconstpointer v2) { @@ -962,16 +963,43 @@ guint rspamd_strcase_hash (gconstpointer key) { const gchar *p = key; - guint h = 0; + gchar buf[256]; + guint h = 0, i = 0; + while (*p != '\0') { - h = (h << 5) - h + g_tolower (*p); + buf[i] = g_ascii_tolower (*p); + i++; p++; + if (i == sizeof (buf)) { + h ^= murmur32_hash (buf, i); + i = 0; + } + } + + if (i > 0) { + h ^= murmur32_hash (buf, i); } return h; } +guint +rspamd_str_hash (gconstpointer key) +{ + gsize len; + + len = strlen ((const gchar *)key); + + return murmur32_hash (key, len); +} + +gboolean +rspamd_str_equal (gconstpointer v, gconstpointer v2) +{ + return strcmp ((const gchar *)v, (const gchar *)v2) == 0; +} + gboolean fstr_strcase_equal (gconstpointer v, gconstpointer v2) { @@ -987,14 +1015,24 @@ fstr_strcase_equal (gconstpointer v, gconstpointer v2) guint fstr_strcase_hash (gconstpointer key) { - const f_str_t *f = key; + const f_str_t *f = key; const gchar *p; - guint h = 0; + guint h = 0, i = 0; + gchar buf[256]; p = f->begin; while (p - f->begin < (gint)f->len) { - h = (h << 5) - h + g_tolower (*p); + buf[i] = g_ascii_tolower (*p); + i++; p++; + if (i == sizeof (buf)) { + h ^= murmur32_hash (buf, i); + i = 0; + } + } + + if (i > 0) { + h ^= murmur32_hash (buf, i); } return h; @@ -1560,6 +1598,176 @@ rspamd_create_thread (const gchar *name, GThreadFunc func, gpointer data, GError return new; } +guint32 +murmur32_hash (const guint8 *in, gsize len) +{ + + + const guint32 c1 = 0xcc9e2d51; + const guint32 c2 = 0x1b873593; + + const int nblocks = len / 4; + const guint32 *blocks = (const guint32 *)(in); + const guint8 *tail; + guint32 h = 0; + gint i; + guint32 k; + + if (in == NULL || len == 0) { + return 0; + } + + tail = (const guint8 *)(in + (nblocks * 4)); + + for (i = 0; i < nblocks; i++) { + k = blocks[i]; + + k *= c1; + k = (k << 15) | (k >> (32 - 15)); + k *= c2; + + h ^= k; + h = (h << 13) | (h >> (32 - 13)); + h = (h * 5) + 0xe6546b64; + } + + k = 0; + switch (len & 3) { + case 3: + k ^= tail[2] << 16; + case 2: + k ^= tail[1] << 8; + case 1: + k ^= tail[0]; + k *= c1; + k = (k << 13) | (k >> (32 - 15)); + k *= c2; + h ^= k; + }; + + h ^= len; + + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +void +murmur128_hash (const guint8 *in, gsize len, guint64 out[]) +{ + const guint64 c1 = 0x87c37b91114253d5ULL; + const guint64 c2 = 0x4cf5ad432745937fULL; + const gint nblocks = len / 16; + const guint64 *blocks = (const guint64 *)(in); + const guint8 *tail; + guint64 h1 = 0; + guint64 h2 = 0; + int i; + guint64 k1, k2; + + if (in == NULL || len == 0 || out == NULL) { + return; + } + + tail = (const guint8 *)(in + (nblocks * 16)); + + for (i = 0; i < nblocks; i++) { + k1 = blocks[i*2+0]; + k2 = blocks[i*2+1]; + + k1 *= c1; + k1 = (k1 << 31) | (k1 >> (64 - 31)); + k1 *= c2; + h1 ^= k1; + + h1 = (h1 << 27) | (h1 >> (64 - 27)); + h1 += h2; + h1 = h1*5+0x52dce729; + + k2 *= c2; + k2 = (k2 << 33) | (k2 >> (64 - 33)); + k2 *= c1; + h2 ^= k2; + + h2 = (h2 << 31) | (h2 >> (64 - 31)); + h2 += h1; + h2 = h2*5+0x38495ab5; + } + + k1 = k2 = 0; + switch (len & 15) { + case 15: + k2 ^= (guint64)(tail[14]) << 48; + case 14: + k2 ^= (guint64)(tail[13]) << 40; + case 13: + k2 ^= (guint64)(tail[12]) << 32; + case 12: + k2 ^= (guint64)(tail[11]) << 24; + case 11: + k2 ^= (guint64)(tail[10]) << 16; + case 10: + k2 ^= (guint64)(tail[ 9]) << 8; + case 9: + k2 ^= (guint64)(tail[ 8]) << 0; + k2 *= c2; + k2 = (k2 << 33) | (k2 >> (64 - 33)); + k2 *= c1; + h2 ^= k2; + + case 8: + k1 ^= (guint64)(tail[ 7]) << 56; + case 7: + k1 ^= (guint64)(tail[ 6]) << 48; + case 6: + k1 ^= (guint64)(tail[ 5]) << 40; + case 5: + k1 ^= (guint64)(tail[ 4]) << 32; + case 4: + k1 ^= (guint64)(tail[ 3]) << 24; + case 3: + k1 ^= (guint64)(tail[ 2]) << 16; + case 2: + k1 ^= (guint64)(tail[ 1]) << 8; + case 1: + k1 ^= (guint64)(tail[ 0]) << 0; + k1 *= c1; + k1 = (k1 << 31) | (k1 >> (64 - 31)); + k1 *= c2; + h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 ^= h1 >> 33; + h1 *= 0xff51afd7ed558ccdULL; + h1 ^= h1 >> 33; + h1 *= 0xc4ceb9fe1a85ec53ULL; + h1 ^= h1 >> 33; + + h2 ^= h2 >> 33; + h2 *= 0xff51afd7ed558ccdULL; + h2 ^= h2 >> 33; + h2 *= 0xc4ceb9fe1a85ec53ULL; + h2 ^= h2 >> 33; + + h1 += h2; + h2 += h1; + + out[0] = h1; + out[1] = h2; +} /* * vi:ts=4 diff --git a/src/util.h b/src/util.h index 961e75645..c8fd8158a 100644 --- a/src/util.h +++ b/src/util.h @@ -145,6 +145,13 @@ gboolean unlock_file (gint fd, gboolean async); guint rspamd_strcase_hash (gconstpointer key); gboolean rspamd_strcase_equal (gconstpointer v, gconstpointer v2); +/* + * Hash table utility functions for case sensitive hashing + */ +guint rspamd_str_hash (gconstpointer key); +gboolean rspamd_str_equal (gconstpointer v, gconstpointer v2); + + /* * Hash table utility functions for hashing fixed strings */ @@ -317,4 +324,39 @@ void rspamd_rwlock_free (rspamd_rwlock_t *mtx); */ GThread* rspamd_create_thread (const gchar *name, GThreadFunc func, gpointer data, GError **err); +/** + * Return 32bit murmur hash value for specified input + * @param in input data + * @param len length of the input data + * @code + * MurmurHash3 was created by Austin Appleby in 2008. The cannonical + * implementations are in C++ and placed in the public. + * + * https://sites.google.com/site/murmurhash/ + * + * Seungyoung Kim has ported it's cannonical implementation to C language + * in 2012 and published it as a part of qLibc component. + * @endcode + * @return + */ +guint32 murmur32_hash (const guint8 *in, gsize len); + +/** + * Return 32bit murmur hash value for specified input + * @param in input data + * @param len length of the input data + * @param out array of 2 guint64 variables + * @code + * MurmurHash3 was created by Austin Appleby in 2008. The cannonical + * implementations are in C++ and placed in the public. + * + * https://sites.google.com/site/murmurhash/ + * + * Seungyoung Kim has ported it's cannonical implementation to C language + * in 2012 and published it as a part of qLibc component. + * @endcode + * @return + */ +void murmur128_hash (const guint8 *in, gsize len, guint64 out[]); + #endif diff --git a/src/worker_util.c b/src/worker_util.c index 976fe58c3..541d4f1e4 100644 --- a/src/worker_util.c +++ b/src/worker_util.c @@ -74,11 +74,11 @@ construct_task (struct rspamd_worker *worker) /* Add destructor for recipients list (it would be better to use anonymous function here */ memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func) rcpt_destruct, new_task); - new_task->results = g_hash_table_new (g_str_hash, g_str_equal); + new_task->results = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func) g_hash_table_destroy, new_task->results); - new_task->re_cache = g_hash_table_new (g_str_hash, g_str_equal); + new_task->re_cache = g_hash_table_new (rspamd_str_hash, rspamd_str_equal); memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func) g_hash_table_destroy, new_task->re_cache); -- 2.39.5