]> source.dussan.org Git - rspamd.git/commitdiff
* Use murmur hash for all hashes as it is more efficient and provides more uniform...
authorVsevolod Stakhov <vsevolod@rambler-co.ru>
Mon, 8 Oct 2012 17:21:53 +0000 (21:21 +0400)
committerVsevolod Stakhov <vsevolod@rambler-co.ru>
Mon, 8 Oct 2012 17:21:53 +0000 (21:21 +0400)
* Fix probability renormalization while using advanced classification.

16 files changed:
src/cfg_utils.c
src/cfg_xml.c
src/classifiers/bayes.c
src/client/rspamc.c
src/expressions.c
src/filter.c
src/lmtp.c
src/lua_worker.c
src/main.c
src/mem_pool.c
src/plugins/regexp.c
src/settings.c
src/symbols_cache.c
src/util.c
src/util.h
src/worker_util.c

index 7dfe54462cea35c3f7d6c9abf37cb98bee1fb02b..37f28c8223d49627bc8118aef6340194bfa0058e 100644 (file)
@@ -215,14 +215,14 @@ init_defaults (struct config_file *cfg)
        cfg->max_diff = 20480;
 
        cfg->max_statfile_size = DEFAULT_STATFILE_SIZE;
-       cfg->modules_opts = g_hash_table_new (g_str_hash, g_str_equal);
-       cfg->variables = g_hash_table_new (g_str_hash, g_str_equal);
-       cfg->metrics = g_hash_table_new (g_str_hash, g_str_equal);
-       cfg->c_modules = g_hash_table_new (g_str_hash, g_str_equal);
-       cfg->composite_symbols = g_hash_table_new (g_str_hash, g_str_equal);
-       cfg->classifiers_symbols = g_hash_table_new (g_str_hash, g_str_equal);
-       cfg->cfg_params = g_hash_table_new (g_str_hash, g_str_equal);
-       cfg->metrics_symbols = g_hash_table_new (g_str_hash, g_str_equal);
+       cfg->modules_opts = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
+       cfg->variables = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
+       cfg->metrics = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
+       cfg->c_modules = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
+       cfg->composite_symbols = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
+       cfg->classifiers_symbols = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
+       cfg->cfg_params = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
+       cfg->metrics_symbols = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
 
        cfg->map_timeout = DEFAULT_MAP_TIMEOUT;
 
@@ -825,11 +825,11 @@ check_classifier_conf (struct config_file *cfg, struct classifier_config *c)
                c = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct classifier_config));
        }
        if (c->opts == NULL) {
-               c->opts = g_hash_table_new (g_str_hash, g_str_equal);
+               c->opts = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
                memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func) g_hash_table_destroy, c->opts);
        }
        if (c->labels == NULL) {
-               c->labels = g_hash_table_new_full (g_str_hash, g_str_equal, NULL, (GDestroyNotify)g_list_free);
+               c->labels = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, NULL, (GDestroyNotify)g_list_free);
                memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func) g_hash_table_destroy, c->labels);
        }
 
@@ -843,7 +843,7 @@ check_statfile_conf (struct config_file *cfg, struct statfile *c)
                c = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
        }
        if (c->opts == NULL) {
-               c->opts = g_hash_table_new (g_str_hash, g_str_equal);
+               c->opts = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
                memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func) g_hash_table_destroy, c->opts);
        }
 
@@ -857,8 +857,8 @@ check_metric_conf (struct config_file *cfg, struct metric *c)
                c = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct metric));
                c->action = METRIC_ACTION_REJECT;
                c->grow_factor = 1.0;
-               c->symbols = g_hash_table_new (g_str_hash, g_str_equal);
-               c->descriptions = g_hash_table_new (g_str_hash, g_str_equal);
+               c->symbols = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
+               c->descriptions = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
                memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func) g_hash_table_destroy, c->symbols);
                memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func) g_hash_table_destroy, c->descriptions);
        }
@@ -871,7 +871,7 @@ check_worker_conf (struct config_file *cfg, struct worker_conf *c)
 {
        if (c == NULL) {
                c = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct worker_conf));
-               c->params = g_hash_table_new (g_str_hash, g_str_equal);
+               c->params = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
                c->active_workers = g_queue_new ();
                memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func)g_hash_table_destroy, c->params);
                memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func)g_queue_free, c->active_workers);
index 99aad12ab8cff0ad6792590e9ca27e578b9c7ce2..277172f2b50c81b6d536fac685a97a3cf9b7b616 100644 (file)
@@ -2310,10 +2310,10 @@ register_module_opt (const gchar *mname, const gchar *optname, enum module_opt_t
        GHashTable                       *module;
 
        if (module_options == NULL) {
-               module_options = g_hash_table_new (g_str_hash, g_str_equal);
+               module_options = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
        }
        if ((module = g_hash_table_lookup (module_options, mname)) == NULL) {
-               module = g_hash_table_new (g_str_hash, g_str_equal);
+               module = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
                g_hash_table_insert (module_options, (char *)mname, module);
        }
        if ((param = g_hash_table_lookup (module, optname)) == NULL) {
@@ -2348,7 +2348,7 @@ register_worker_opt (gint wtype, const gchar *optname, element_handler_func func
                worker_options = g_hash_table_new (g_int_hash, g_int_equal);
        }
        if ((worker = g_hash_table_lookup (worker_options, &wtype)) == NULL) {
-               worker = g_hash_table_new (g_str_hash, g_str_equal);
+               worker = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
                new_key = g_malloc (sizeof (gint));
                *new_key = wtype;
                g_hash_table_insert (worker_options, new_key, worker);
@@ -2372,10 +2372,10 @@ register_classifier_opt (const gchar *ctype, const gchar *optname)
        GHashTable                       *classifier;
 
        if (classifier_options == NULL) {
-               classifier_options = g_hash_table_new (g_str_hash, g_str_equal);
+               classifier_options = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
        }
        if ((classifier = g_hash_table_lookup (classifier_options, ctype)) == NULL) {
-               classifier = g_hash_table_new (g_str_hash, g_str_equal);
+               classifier = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
                g_hash_table_insert (classifier_options, (char *)ctype, classifier);
        }
        if ((param = g_hash_table_lookup (classifier, optname)) == NULL) {
@@ -2406,7 +2406,7 @@ register_subparser (const gchar *tag, enum xml_read_state state, const GMarkupPa
        struct xml_subparser                     *subparser;
 
        if (subparsers == NULL) {
-               subparsers = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free);
+               subparsers = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, g_free, g_free);
        }
        subparser = g_malloc (sizeof (struct xml_subparser));
 
index a80bbe0ba32a7296ca08d9f0e697017904191cdc..0e68f5d72327378f7908761ff0731c2d204cd739 100644 (file)
@@ -182,10 +182,10 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input,
 {
        struct bayes_callback_data      data;
        gchar                          *value;
-       gint                            nodes, i = 0, cnt, best_num = 0;
+       gint                            nodes, i = 0, cnt, best_num_spam = 0, best_num_ham = 0;
        gint                            minnodes;
        guint64                         rev, total_learns = 0;
-       double                          best = 0;
+       double                          best_spam = 0., best_ham = 0., total_spam = 0., total_ham = 0.;
        struct statfile                *st;
        stat_file_t                    *file;
        GList                          *cur;
@@ -262,17 +262,37 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input,
 
        for (i = 0; i < cnt; i ++) {
                debug_task ("got probability for symbol %s: %.2f", data.statfiles[i].st->symbol, data.statfiles[i].post_probability);
-               if (data.statfiles[i].post_probability > best) {
-                       best = data.statfiles[i].post_probability;
-                       best_num = i;
+
+               if (data.statfiles[i].st->is_spam) {
+                       total_spam += data.statfiles[i].post_probability;
+                       if (data.statfiles[i].post_probability > best_spam) {
+                               best_spam = data.statfiles[i].post_probability;
+                               best_num_spam = i;
+                       }
+               }
+               else {
+                       total_ham += data.statfiles[i].post_probability;
+                       if (data.statfiles[i].post_probability > best_ham) {
+                               best_ham = data.statfiles[i].post_probability;
+                               best_num_ham = i;
+                       }
                }
        }
 
-       if (best > 0.5) {
+
+       if (total_ham > 0.5 || total_spam > 0.5) {
+
                sumbuf = memory_pool_alloc (task->task_pool, 32);
-               rspamd_snprintf (sumbuf, 32, "%.2f", best);
-               cur = g_list_prepend (NULL, sumbuf);
-               insert_result (task, data.statfiles[best_num].st->symbol, best, cur);
+               if (total_ham > total_spam) {
+                       rspamd_snprintf (sumbuf, 32, "%.2f", total_ham);
+                       cur = g_list_prepend (NULL, sumbuf);
+                       insert_result (task, data.statfiles[best_num_ham].st->symbol, total_ham, cur);
+               }
+               else {
+                       rspamd_snprintf (sumbuf, 32, "%.2f", total_spam);
+                       cur = g_list_prepend (NULL, sumbuf);
+                       insert_result (task, data.statfiles[best_num_spam].st->symbol, total_spam, cur);
+               }
        }
 
        g_free (data.statfiles);
index aae8345d5952270887377a33b2d4ed872eaba545..74423bbf3bf13002c2044ed6feb0a36b624d2a43 100644 (file)
@@ -352,7 +352,7 @@ scan_rspamd_stdin (void)
        GHashTable                      *opts;
 
        /* Init options hash */
-       opts = g_hash_table_new (g_str_hash, g_str_equal);
+       opts = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
        add_options (opts);
        /* Add server */
        add_rspamd_server (FALSE);
@@ -390,7 +390,7 @@ scan_rspamd_file (const gchar *file)
        /* Add server */
        add_rspamd_server (FALSE);
        /* Init options hash */
-       opts = g_hash_table_new (g_str_hash, g_str_equal);
+       opts = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
        add_options (opts);
        res = rspamd_scan_file (client, file, opts, &err);
        g_hash_table_destroy (opts);
@@ -435,7 +435,7 @@ learn_rspamd_stdin (gboolean is_spam)
                }
        }
 
-       params = g_hash_table_new (g_str_hash, g_str_equal);
+       params = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
        g_hash_table_insert (params, "Classifier", classifier);
 
        results = rspamd_controller_command_memory (client, is_spam ? "learn_spam" : "learn_ham", password, params, in_buf, r, &err);
@@ -482,7 +482,7 @@ learn_rspamd_file (gboolean is_spam, const gchar *file)
 
        /* Add server */
        add_rspamd_server (TRUE);
-       params = g_hash_table_new (g_str_hash, g_str_equal);
+       params = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
        g_hash_table_insert (params, "Classifier", classifier);
 
        results = rspamd_controller_command_file (client, is_spam ? "learn_spam" : "learn_ham", password, params, file, &err);
@@ -525,7 +525,7 @@ fuzzy_rspamd_stdin (gboolean delete)
        gchar                                                    valuebuf[sizeof("65535")], flagbuf[sizeof("65535")];
        struct rspamd_controller_result *res;
 
-       params = g_hash_table_new (g_str_hash, g_str_equal);
+       params = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
        rspamd_snprintf (valuebuf, sizeof (valuebuf), "%d", weight);
        rspamd_snprintf (flagbuf, sizeof (flagbuf), "%d", flag);
        g_hash_table_insert (params, "Value", valuebuf);
@@ -588,7 +588,7 @@ fuzzy_rspamd_file (const gchar *file, gboolean delete)
        /* Add server */
        add_rspamd_server (TRUE);
 
-       params = g_hash_table_new (g_str_hash, g_str_equal);
+       params = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
        rspamd_snprintf (valuebuf, sizeof (valuebuf), "%d", weight);
        rspamd_snprintf (flagbuf, sizeof (flagbuf), "%d", flag);
        g_hash_table_insert (params, "Value", valuebuf);
@@ -671,7 +671,7 @@ main (gint argc, gchar **argv, gchar **env)
        GHashTable                                              *kwattrs;
 
 
-       kwattrs = g_hash_table_new (g_str_hash, g_str_equal);
+       kwattrs = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
 
        read_cmd_line (&argc, &argv);
 
index 1f04843bd4a8f1f4c298a8dbe18696c9b47500e0..f3356a21897a6a97b21be3a7f371600c09e58c20 100644 (file)
@@ -87,7 +87,7 @@ re_cache_check (const gchar *line, memory_pool_t *pool)
        re_cache = memory_pool_get_variable (pool, "re_cache");
 
        if (re_cache == NULL) {
-               re_cache = g_hash_table_new (g_str_hash, g_str_equal);
+               re_cache = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
                memory_pool_set_variable (pool, "re_cache", re_cache, (pool_destruct_func)g_hash_table_destroy);
                return NULL;
        }
@@ -102,7 +102,7 @@ re_cache_add (const gchar *line, void *pointer, memory_pool_t *pool)
        re_cache = memory_pool_get_variable (pool, "re_cache");
 
        if (re_cache == NULL) {
-               re_cache = g_hash_table_new (g_str_hash, g_str_equal);
+               re_cache = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
                memory_pool_set_variable (pool, "re_cache", re_cache, (pool_destruct_func)g_hash_table_destroy);
        }
 
index d26a34af019af4cb1028116003b4976832740331..bbe3b46781a4503535a43f6218ce110b894e0ae1 100644 (file)
@@ -75,7 +75,7 @@ insert_metric_result (struct worker_task *task, struct metric *metric, const gch
        if (metric_res == NULL) {
                /* Create new metric chain */
                metric_res = memory_pool_alloc (task->task_pool, sizeof (struct metric_result));
-               metric_res->symbols = g_hash_table_new (g_str_hash, g_str_equal);
+               metric_res->symbols = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
                metric_res->checked = FALSE;
                memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_hash_table_unref, metric_res->symbols);
                metric_res->metric = metric;
index 4e7373158e5a228c003d678efe1abf90770da875..67743e92a4f08447dcef80b043ef7738ebc8d1c7 100644 (file)
@@ -271,7 +271,7 @@ accept_socket (gint fd, short what, void *arg)
        new_task->task_pool = memory_pool_new (memory_pool_get_size ());
        /* Add destructor for recipients list (it would be better to use anonymous function here */
        memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func) rcpt_destruct, new_task);
-       new_task->results = g_hash_table_new (g_str_hash, g_str_equal);
+       new_task->results = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
        new_task->ev_base = worker->ctx;
        memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func) g_hash_table_destroy, new_task->results);
        worker->srv->stat->connections_count++;
index c75d275a6cd6e5d64031c6d87793bc06785f26b6..c4276968a89c20961774fc5f874943ccaf9cecd0 100644 (file)
@@ -410,7 +410,7 @@ init_lua_worker (void)
        type = g_quark_try_string ("lua");
 
        ctx = g_malloc0 (sizeof (struct rspamd_lua_worker_ctx));
-       ctx->params = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, (GDestroyNotify)g_list_free);
+       ctx->params = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, g_free, (GDestroyNotify)g_list_free);
 
 
        register_worker_opt (type, "file", xml_handle_string, ctx, G_STRUCT_OFFSET (struct rspamd_lua_worker_ctx, file));
index a5cad0ba9af9e57a4fd5b585daf29853ae31dedd..d28672e83b0cd83af4e6433a12638bd2f08fbe3d 100644 (file)
@@ -878,7 +878,7 @@ main (gint argc, gchar **argv, gchar **env)
        }
 
        /* Init counters */
-       rspamd_main->counters = rspamd_hash_new_shared (rspamd_main->server_pool, g_str_hash, g_str_equal, 64);
+       rspamd_main->counters = rspamd_hash_new_shared (rspamd_main->server_pool, rspamd_str_hash, rspamd_str_equal, 64);
        /* Init listen sockets hash */
        listen_sockets = g_hash_table_new (g_direct_hash, g_direct_equal);
 
@@ -893,7 +893,7 @@ main (gint argc, gchar **argv, gchar **env)
        rspamd_main->cfg->cache = g_new0 (struct symbols_cache, 1);
        rspamd_main->cfg->cache->static_pool = memory_pool_new (memory_pool_get_size ());
        rspamd_main->cfg->cache->cfg = rspamd_main->cfg;
-       rspamd_main->cfg->cache->items_by_symbol = g_hash_table_new (g_str_hash, g_str_equal);
+       rspamd_main->cfg->cache->items_by_symbol = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
 
        /* Load config */
        if (! load_rspamd_config (rspamd_main->cfg, TRUE)) {
index f7636e5388c8641361bf9c82bcff8ba4dc8accf3..5ab582352da50d9f2381744c3a24e04838c14468 100644 (file)
@@ -746,7 +746,7 @@ void
 memory_pool_set_variable (memory_pool_t *pool, const gchar *name, gpointer value, pool_destruct_func destructor)
 {
        if (pool->variables == NULL) {
-               pool->variables = g_hash_table_new (g_str_hash, g_str_equal);
+               pool->variables = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
        }
 
        g_hash_table_insert (pool->variables, memory_pool_strdup (pool, name), value);
index f5c94bd1f4fb597e8ea6a08781e0f7ebc3b3c7d2..ebb4ff64829f0095c1e4d7be0829288818e2de8b 100644 (file)
@@ -514,7 +514,7 @@ regexp_module_init (struct config_file *cfg, struct module_ctx **ctx)
        regexp_module_ctx->filter = regexp_common_filter;
        regexp_module_ctx->regexp_pool = memory_pool_new (memory_pool_get_size ());
        regexp_module_ctx->dynamic_pool = NULL;
-       regexp_module_ctx->autolearn_symbols = g_hash_table_new (g_str_hash, g_str_equal);
+       regexp_module_ctx->autolearn_symbols = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
        regexp_module_ctx->workers = NULL;
 
        *ctx = (struct module_ctx *)regexp_module_ctx;
index d21fe02d055776e4c794a85b8c4ea3cbb99d8e0d..0d856bf3f7d9efb0267db62726c7a89dd20d2612 100644 (file)
@@ -81,12 +81,12 @@ settings_ref (struct rspamd_settings *s)
 {
        if (s == NULL) {
                s = g_slice_alloc (sizeof (struct rspamd_settings));
-               s->metric_scores = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free);
-               s->reject_scores = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free);
-               s->metric_actions = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, settings_actions_free);
-               s->factors = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free);
-               s->whitelist = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free);
-               s->blacklist = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free);
+               s->metric_scores = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, g_free, g_free);
+               s->reject_scores = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, g_free, g_free);
+               s->metric_actions = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, g_free, settings_actions_free);
+               s->factors = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, g_free, g_free);
+               s->whitelist = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, g_free, g_free);
+               s->blacklist = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, g_free, g_free);
                s->statfile_alias = NULL;
                s->want_spam = FALSE;
                s->ref_count = 1;
index 9b8d5702619653a3d6047910a163bed076c48cb1..b273ea1c950def9f8c32286857f94c32daab3a0c 100644 (file)
@@ -271,7 +271,7 @@ register_symbol_common (struct symbols_cache **cache, const gchar *name, double
                pcache = g_new0 (struct symbols_cache, 1);
                *cache = pcache;
                pcache->static_pool = memory_pool_new (memory_pool_get_size ());
-               pcache->items_by_symbol = g_hash_table_new (g_str_hash, g_str_equal);
+               pcache->items_by_symbol = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
        }
        
        item = memory_pool_alloc0 (pcache->static_pool, sizeof (struct cache_item));
index 30c85ddf25d0b8b7482c3f50782aaad5a038e655..707f4c449f6baa4243f092f6a3ef3e77ff1bd501 100644 (file)
@@ -947,6 +947,7 @@ calculate_check_time (struct timeval *begin, gint resolution)
 #   define g_tolower(x) (((x) >= 'A' && (x) <= 'Z') ? (x) - 'A' + 'a' : (x))
 #endif
 
+
 gboolean
 rspamd_strcase_equal (gconstpointer v, gconstpointer v2)
 {
@@ -962,16 +963,43 @@ guint
 rspamd_strcase_hash (gconstpointer key)
 {
        const gchar                     *p = key;
-       guint                           h = 0;
+       gchar                                                    buf[256];
+       guint                            h = 0, i = 0;
+
 
        while (*p != '\0') {
-               h = (h << 5) - h + g_tolower (*p);
+               buf[i] = g_ascii_tolower (*p);
+               i++;
                p++;
+               if (i == sizeof (buf)) {
+                       h ^= murmur32_hash (buf, i);
+                       i = 0;
+               }
+       }
+
+       if (i > 0) {
+               h ^= murmur32_hash (buf, i);
        }
 
        return h;
 }
 
+guint
+rspamd_str_hash (gconstpointer key)
+{
+       gsize                                                   len;
+
+       len = strlen ((const gchar *)key);
+
+       return murmur32_hash (key, len);
+}
+
+gboolean
+rspamd_str_equal (gconstpointer v, gconstpointer v2)
+{
+       return strcmp ((const gchar *)v, (const gchar *)v2) == 0;
+}
+
 gboolean
 fstr_strcase_equal (gconstpointer v, gconstpointer v2)
 {
@@ -987,14 +1015,24 @@ fstr_strcase_equal (gconstpointer v, gconstpointer v2)
 guint
 fstr_strcase_hash (gconstpointer key)
 {
-       const f_str_t                  *f = key;
+       const f_str_t                   *f = key;
        const gchar                     *p;
-       guint                           h = 0;
+       guint                            h = 0, i = 0;
+       gchar                                                    buf[256];
        
        p = f->begin;
        while (p - f->begin < (gint)f->len) {
-               h = (h << 5) - h + g_tolower (*p);
+               buf[i] = g_ascii_tolower (*p);
+               i++;
                p++;
+               if (i == sizeof (buf)) {
+                       h ^= murmur32_hash (buf, i);
+                       i = 0;
+               }
+       }
+
+       if (i > 0) {
+               h ^= murmur32_hash (buf, i);
        }
 
        return h;
@@ -1560,6 +1598,176 @@ rspamd_create_thread (const gchar *name, GThreadFunc func, gpointer data, GError
        return new;
 }
 
+guint32
+murmur32_hash (const guint8 *in, gsize len)
+{
+
+
+       const guint32                    c1 = 0xcc9e2d51;
+       const guint32                    c2 = 0x1b873593;
+
+       const int                                nblocks = len / 4;
+       const guint32                   *blocks = (const guint32 *)(in);
+       const guint8                    *tail;
+       guint32                                  h = 0;
+       gint                                     i;
+       guint32                                  k;
+
+       if (in == NULL || len == 0) {
+               return 0;
+       }
+
+       tail = (const guint8 *)(in + (nblocks * 4));
+
+       for (i = 0; i < nblocks; i++) {
+               k = blocks[i];
+
+               k *= c1;
+               k = (k << 15) | (k >> (32 - 15));
+               k *= c2;
+
+               h ^= k;
+               h = (h << 13) | (h >> (32 - 13));
+               h = (h * 5) + 0xe6546b64;
+       }
+
+       k = 0;
+       switch (len & 3) {
+       case 3:
+               k ^= tail[2] << 16;
+       case 2:
+               k ^= tail[1] << 8;
+       case 1:
+               k ^= tail[0];
+               k *= c1;
+               k = (k << 13) | (k >> (32 - 15));
+               k *= c2;
+               h ^= k;
+       };
+
+       h ^= len;
+
+       h ^= h >> 16;
+       h *= 0x85ebca6b;
+       h ^= h >> 13;
+       h *= 0xc2b2ae35;
+       h ^= h >> 16;
+
+       return h;
+}
+
+void
+murmur128_hash (const guint8 *in, gsize len, guint64 out[])
+{
+       const guint64                    c1 = 0x87c37b91114253d5ULL;
+       const guint64                    c2 = 0x4cf5ad432745937fULL;
+       const gint                               nblocks = len / 16;
+       const guint64                   *blocks = (const guint64 *)(in);
+       const guint8                    *tail;
+       guint64                                  h1 = 0;
+       guint64                                  h2 = 0;
+       int                                      i;
+       guint64                                  k1, k2;
+
+       if (in == NULL || len == 0 || out == NULL) {
+               return;
+       }
+
+       tail = (const guint8 *)(in + (nblocks * 16));
+
+       for (i = 0; i < nblocks; i++) {
+               k1 = blocks[i*2+0];
+               k2 = blocks[i*2+1];
+
+               k1 *= c1;
+               k1  = (k1 << 31) | (k1 >> (64 - 31));
+               k1 *= c2;
+               h1 ^= k1;
+
+               h1 = (h1 << 27) | (h1 >> (64 - 27));
+               h1 += h2;
+               h1 = h1*5+0x52dce729;
+
+               k2 *= c2;
+               k2  = (k2 << 33) | (k2 >> (64 - 33));
+               k2 *= c1;
+               h2 ^= k2;
+
+               h2 = (h2 << 31) | (h2 >> (64 - 31));
+               h2 += h1;
+               h2 = h2*5+0x38495ab5;
+       }
+
+       k1 = k2 = 0;
+       switch (len & 15) {
+       case 15:
+               k2 ^= (guint64)(tail[14]) << 48;
+       case 14:
+               k2 ^= (guint64)(tail[13]) << 40;
+       case 13:
+               k2 ^= (guint64)(tail[12]) << 32;
+       case 12:
+               k2 ^= (guint64)(tail[11]) << 24;
+       case 11:
+               k2 ^= (guint64)(tail[10]) << 16;
+       case 10:
+               k2 ^= (guint64)(tail[ 9]) << 8;
+       case  9:
+               k2 ^= (guint64)(tail[ 8]) << 0;
+               k2 *= c2;
+               k2  = (k2 << 33) | (k2 >> (64 - 33));
+               k2 *= c1;
+               h2 ^= k2;
+
+       case  8:
+               k1 ^= (guint64)(tail[ 7]) << 56;
+       case  7:
+               k1 ^= (guint64)(tail[ 6]) << 48;
+       case  6:
+               k1 ^= (guint64)(tail[ 5]) << 40;
+       case  5:
+               k1 ^= (guint64)(tail[ 4]) << 32;
+       case  4:
+               k1 ^= (guint64)(tail[ 3]) << 24;
+       case  3:
+               k1 ^= (guint64)(tail[ 2]) << 16;
+       case  2:
+               k1 ^= (guint64)(tail[ 1]) << 8;
+       case  1:
+               k1 ^= (guint64)(tail[ 0]) << 0;
+               k1 *= c1;
+               k1  = (k1 << 31) | (k1 >> (64 - 31));
+               k1 *= c2;
+               h1 ^= k1;
+       };
+
+       //----------
+       // finalization
+
+       h1 ^= len;
+       h2 ^= len;
+
+       h1 += h2;
+       h2 += h1;
+
+       h1 ^= h1 >> 33;
+       h1 *= 0xff51afd7ed558ccdULL;
+       h1 ^= h1 >> 33;
+       h1 *= 0xc4ceb9fe1a85ec53ULL;
+       h1 ^= h1 >> 33;
+
+       h2 ^= h2 >> 33;
+       h2 *= 0xff51afd7ed558ccdULL;
+       h2 ^= h2 >> 33;
+       h2 *= 0xc4ceb9fe1a85ec53ULL;
+       h2 ^= h2 >> 33;
+
+       h1 += h2;
+       h2 += h1;
+
+       out[0] = h1;
+       out[1] = h2;
+}
 
 /*
  * vi:ts=4
index 961e756459320da83ec8ebbdfd53d1de9cf2e444..c8fd8158aff900da10c7f6a7eb370b264ae57890 100644 (file)
@@ -145,6 +145,13 @@ gboolean unlock_file (gint fd, gboolean async);
 guint rspamd_strcase_hash (gconstpointer key);
 gboolean rspamd_strcase_equal (gconstpointer v, gconstpointer v2);
 
+/*
+ * Hash table utility functions for case sensitive hashing
+ */
+guint rspamd_str_hash (gconstpointer key);
+gboolean rspamd_str_equal (gconstpointer v, gconstpointer v2);
+
+
 /*
  * Hash table utility functions for hashing fixed strings
  */
@@ -317,4 +324,39 @@ void rspamd_rwlock_free (rspamd_rwlock_t *mtx);
  */
 GThread* rspamd_create_thread (const gchar *name, GThreadFunc func, gpointer data, GError **err);
 
+/**
+ * Return 32bit murmur hash value for specified input
+ * @param in input data
+ * @param len length of the input data
+ * @code
+ *  MurmurHash3 was created by Austin Appleby  in 2008. The cannonical
+ *  implementations are in C++ and placed in the public.
+ *
+ *    https://sites.google.com/site/murmurhash/
+ *
+ *  Seungyoung Kim has ported it's cannonical implementation to C language
+ *  in 2012 and published it as a part of qLibc component.
+ * @endcode
+ * @return
+ */
+guint32 murmur32_hash (const guint8 *in, gsize len);
+
+/**
+ * Return 32bit murmur hash value for specified input
+ * @param in input data
+ * @param len length of the input data
+ * @param out array of 2 guint64 variables
+ * @code
+ *  MurmurHash3 was created by Austin Appleby  in 2008. The cannonical
+ *  implementations are in C++ and placed in the public.
+ *
+ *    https://sites.google.com/site/murmurhash/
+ *
+ *  Seungyoung Kim has ported it's cannonical implementation to C language
+ *  in 2012 and published it as a part of qLibc component.
+ * @endcode
+ * @return
+ */
+void murmur128_hash (const guint8 *in, gsize len, guint64 out[]);
+
 #endif
index 976fe58c3cb3d022f67225b97b28739e75341b06..541d4f1e4a0bb2c36d9aff3a66c031f75192a432 100644 (file)
@@ -74,11 +74,11 @@ construct_task (struct rspamd_worker *worker)
        /* Add destructor for recipients list (it would be better to use anonymous function here */
        memory_pool_add_destructor (new_task->task_pool,
                        (pool_destruct_func) rcpt_destruct, new_task);
-       new_task->results = g_hash_table_new (g_str_hash, g_str_equal);
+       new_task->results = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
        memory_pool_add_destructor (new_task->task_pool,
                        (pool_destruct_func) g_hash_table_destroy,
                        new_task->results);
-       new_task->re_cache = g_hash_table_new (g_str_hash, g_str_equal);
+       new_task->re_cache = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
        memory_pool_add_destructor (new_task->task_pool,
                        (pool_destruct_func) g_hash_table_destroy,
                        new_task->re_cache);