summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2012-10-08 21:21:53 +0400
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2012-10-08 21:21:53 +0400
commit8e399cdba1bba1da8c1de2b8a22efe719aa30cde (patch)
tree579eca49aa1fd0b01d4b9739418f8ed592225ef4 /src
parenta1e2b2d84b185b5430252d7ead5806944be433af (diff)
downloadrspamd-8e399cdba1bba1da8c1de2b8a22efe719aa30cde.tar.gz
rspamd-8e399cdba1bba1da8c1de2b8a22efe719aa30cde.zip
* Use murmur hash for all hashes as it is more efficient and provides more uniform distribution as glib's default one.
* Fix probability renormalization while using advanced classification.
Diffstat (limited to 'src')
-rw-r--r--src/cfg_utils.c28
-rw-r--r--src/cfg_xml.c12
-rw-r--r--src/classifiers/bayes.c38
-rw-r--r--src/client/rspamc.c14
-rw-r--r--src/expressions.c4
-rw-r--r--src/filter.c2
-rw-r--r--src/lmtp.c2
-rw-r--r--src/lua_worker.c2
-rw-r--r--src/main.c4
-rw-r--r--src/mem_pool.c2
-rw-r--r--src/plugins/regexp.c2
-rw-r--r--src/settings.c12
-rw-r--r--src/symbols_cache.c2
-rw-r--r--src/util.c218
-rw-r--r--src/util.h42
-rw-r--r--src/worker_util.c4
16 files changed, 329 insertions, 59 deletions
diff --git a/src/cfg_utils.c b/src/cfg_utils.c
index 7dfe54462..37f28c822 100644
--- a/src/cfg_utils.c
+++ b/src/cfg_utils.c
@@ -215,14 +215,14 @@ init_defaults (struct config_file *cfg)
cfg->max_diff = 20480;
cfg->max_statfile_size = DEFAULT_STATFILE_SIZE;
- cfg->modules_opts = g_hash_table_new (g_str_hash, g_str_equal);
- cfg->variables = g_hash_table_new (g_str_hash, g_str_equal);
- cfg->metrics = g_hash_table_new (g_str_hash, g_str_equal);
- cfg->c_modules = g_hash_table_new (g_str_hash, g_str_equal);
- cfg->composite_symbols = g_hash_table_new (g_str_hash, g_str_equal);
- cfg->classifiers_symbols = g_hash_table_new (g_str_hash, g_str_equal);
- cfg->cfg_params = g_hash_table_new (g_str_hash, g_str_equal);
- cfg->metrics_symbols = g_hash_table_new (g_str_hash, g_str_equal);
+ cfg->modules_opts = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
+ cfg->variables = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
+ cfg->metrics = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
+ cfg->c_modules = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
+ cfg->composite_symbols = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
+ cfg->classifiers_symbols = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
+ cfg->cfg_params = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
+ cfg->metrics_symbols = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
cfg->map_timeout = DEFAULT_MAP_TIMEOUT;
@@ -825,11 +825,11 @@ check_classifier_conf (struct config_file *cfg, struct classifier_config *c)
c = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct classifier_config));
}
if (c->opts == NULL) {
- c->opts = g_hash_table_new (g_str_hash, g_str_equal);
+ c->opts = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func) g_hash_table_destroy, c->opts);
}
if (c->labels == NULL) {
- c->labels = g_hash_table_new_full (g_str_hash, g_str_equal, NULL, (GDestroyNotify)g_list_free);
+ c->labels = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, NULL, (GDestroyNotify)g_list_free);
memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func) g_hash_table_destroy, c->labels);
}
@@ -843,7 +843,7 @@ check_statfile_conf (struct config_file *cfg, struct statfile *c)
c = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
}
if (c->opts == NULL) {
- c->opts = g_hash_table_new (g_str_hash, g_str_equal);
+ c->opts = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func) g_hash_table_destroy, c->opts);
}
@@ -857,8 +857,8 @@ check_metric_conf (struct config_file *cfg, struct metric *c)
c = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct metric));
c->action = METRIC_ACTION_REJECT;
c->grow_factor = 1.0;
- c->symbols = g_hash_table_new (g_str_hash, g_str_equal);
- c->descriptions = g_hash_table_new (g_str_hash, g_str_equal);
+ c->symbols = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
+ c->descriptions = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func) g_hash_table_destroy, c->symbols);
memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func) g_hash_table_destroy, c->descriptions);
}
@@ -871,7 +871,7 @@ check_worker_conf (struct config_file *cfg, struct worker_conf *c)
{
if (c == NULL) {
c = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct worker_conf));
- c->params = g_hash_table_new (g_str_hash, g_str_equal);
+ c->params = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
c->active_workers = g_queue_new ();
memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func)g_hash_table_destroy, c->params);
memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func)g_queue_free, c->active_workers);
diff --git a/src/cfg_xml.c b/src/cfg_xml.c
index 99aad12ab..277172f2b 100644
--- a/src/cfg_xml.c
+++ b/src/cfg_xml.c
@@ -2310,10 +2310,10 @@ register_module_opt (const gchar *mname, const gchar *optname, enum module_opt_t
GHashTable *module;
if (module_options == NULL) {
- module_options = g_hash_table_new (g_str_hash, g_str_equal);
+ module_options = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
}
if ((module = g_hash_table_lookup (module_options, mname)) == NULL) {
- module = g_hash_table_new (g_str_hash, g_str_equal);
+ module = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
g_hash_table_insert (module_options, (char *)mname, module);
}
if ((param = g_hash_table_lookup (module, optname)) == NULL) {
@@ -2348,7 +2348,7 @@ register_worker_opt (gint wtype, const gchar *optname, element_handler_func func
worker_options = g_hash_table_new (g_int_hash, g_int_equal);
}
if ((worker = g_hash_table_lookup (worker_options, &wtype)) == NULL) {
- worker = g_hash_table_new (g_str_hash, g_str_equal);
+ worker = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
new_key = g_malloc (sizeof (gint));
*new_key = wtype;
g_hash_table_insert (worker_options, new_key, worker);
@@ -2372,10 +2372,10 @@ register_classifier_opt (const gchar *ctype, const gchar *optname)
GHashTable *classifier;
if (classifier_options == NULL) {
- classifier_options = g_hash_table_new (g_str_hash, g_str_equal);
+ classifier_options = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
}
if ((classifier = g_hash_table_lookup (classifier_options, ctype)) == NULL) {
- classifier = g_hash_table_new (g_str_hash, g_str_equal);
+ classifier = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
g_hash_table_insert (classifier_options, (char *)ctype, classifier);
}
if ((param = g_hash_table_lookup (classifier, optname)) == NULL) {
@@ -2406,7 +2406,7 @@ register_subparser (const gchar *tag, enum xml_read_state state, const GMarkupPa
struct xml_subparser *subparser;
if (subparsers == NULL) {
- subparsers = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free);
+ subparsers = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, g_free, g_free);
}
subparser = g_malloc (sizeof (struct xml_subparser));
diff --git a/src/classifiers/bayes.c b/src/classifiers/bayes.c
index a80bbe0ba..0e68f5d72 100644
--- a/src/classifiers/bayes.c
+++ b/src/classifiers/bayes.c
@@ -182,10 +182,10 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input,
{
struct bayes_callback_data data;
gchar *value;
- gint nodes, i = 0, cnt, best_num = 0;
+ gint nodes, i = 0, cnt, best_num_spam = 0, best_num_ham = 0;
gint minnodes;
guint64 rev, total_learns = 0;
- double best = 0;
+ double best_spam = 0., best_ham = 0., total_spam = 0., total_ham = 0.;
struct statfile *st;
stat_file_t *file;
GList *cur;
@@ -262,17 +262,37 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input,
for (i = 0; i < cnt; i ++) {
debug_task ("got probability for symbol %s: %.2f", data.statfiles[i].st->symbol, data.statfiles[i].post_probability);
- if (data.statfiles[i].post_probability > best) {
- best = data.statfiles[i].post_probability;
- best_num = i;
+
+ if (data.statfiles[i].st->is_spam) {
+ total_spam += data.statfiles[i].post_probability;
+ if (data.statfiles[i].post_probability > best_spam) {
+ best_spam = data.statfiles[i].post_probability;
+ best_num_spam = i;
+ }
+ }
+ else {
+ total_ham += data.statfiles[i].post_probability;
+ if (data.statfiles[i].post_probability > best_ham) {
+ best_ham = data.statfiles[i].post_probability;
+ best_num_ham = i;
+ }
}
}
- if (best > 0.5) {
+
+ if (total_ham > 0.5 || total_spam > 0.5) {
+
sumbuf = memory_pool_alloc (task->task_pool, 32);
- rspamd_snprintf (sumbuf, 32, "%.2f", best);
- cur = g_list_prepend (NULL, sumbuf);
- insert_result (task, data.statfiles[best_num].st->symbol, best, cur);
+ if (total_ham > total_spam) {
+ rspamd_snprintf (sumbuf, 32, "%.2f", total_ham);
+ cur = g_list_prepend (NULL, sumbuf);
+ insert_result (task, data.statfiles[best_num_ham].st->symbol, total_ham, cur);
+ }
+ else {
+ rspamd_snprintf (sumbuf, 32, "%.2f", total_spam);
+ cur = g_list_prepend (NULL, sumbuf);
+ insert_result (task, data.statfiles[best_num_spam].st->symbol, total_spam, cur);
+ }
}
g_free (data.statfiles);
diff --git a/src/client/rspamc.c b/src/client/rspamc.c
index aae8345d5..74423bbf3 100644
--- a/src/client/rspamc.c
+++ b/src/client/rspamc.c
@@ -352,7 +352,7 @@ scan_rspamd_stdin (void)
GHashTable *opts;
/* Init options hash */
- opts = g_hash_table_new (g_str_hash, g_str_equal);
+ opts = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
add_options (opts);
/* Add server */
add_rspamd_server (FALSE);
@@ -390,7 +390,7 @@ scan_rspamd_file (const gchar *file)
/* Add server */
add_rspamd_server (FALSE);
/* Init options hash */
- opts = g_hash_table_new (g_str_hash, g_str_equal);
+ opts = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
add_options (opts);
res = rspamd_scan_file (client, file, opts, &err);
g_hash_table_destroy (opts);
@@ -435,7 +435,7 @@ learn_rspamd_stdin (gboolean is_spam)
}
}
- params = g_hash_table_new (g_str_hash, g_str_equal);
+ params = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
g_hash_table_insert (params, "Classifier", classifier);
results = rspamd_controller_command_memory (client, is_spam ? "learn_spam" : "learn_ham", password, params, in_buf, r, &err);
@@ -482,7 +482,7 @@ learn_rspamd_file (gboolean is_spam, const gchar *file)
/* Add server */
add_rspamd_server (TRUE);
- params = g_hash_table_new (g_str_hash, g_str_equal);
+ params = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
g_hash_table_insert (params, "Classifier", classifier);
results = rspamd_controller_command_file (client, is_spam ? "learn_spam" : "learn_ham", password, params, file, &err);
@@ -525,7 +525,7 @@ fuzzy_rspamd_stdin (gboolean delete)
gchar valuebuf[sizeof("65535")], flagbuf[sizeof("65535")];
struct rspamd_controller_result *res;
- params = g_hash_table_new (g_str_hash, g_str_equal);
+ params = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
rspamd_snprintf (valuebuf, sizeof (valuebuf), "%d", weight);
rspamd_snprintf (flagbuf, sizeof (flagbuf), "%d", flag);
g_hash_table_insert (params, "Value", valuebuf);
@@ -588,7 +588,7 @@ fuzzy_rspamd_file (const gchar *file, gboolean delete)
/* Add server */
add_rspamd_server (TRUE);
- params = g_hash_table_new (g_str_hash, g_str_equal);
+ params = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
rspamd_snprintf (valuebuf, sizeof (valuebuf), "%d", weight);
rspamd_snprintf (flagbuf, sizeof (flagbuf), "%d", flag);
g_hash_table_insert (params, "Value", valuebuf);
@@ -671,7 +671,7 @@ main (gint argc, gchar **argv, gchar **env)
GHashTable *kwattrs;
- kwattrs = g_hash_table_new (g_str_hash, g_str_equal);
+ kwattrs = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
read_cmd_line (&argc, &argv);
diff --git a/src/expressions.c b/src/expressions.c
index 1f04843bd..f3356a218 100644
--- a/src/expressions.c
+++ b/src/expressions.c
@@ -87,7 +87,7 @@ re_cache_check (const gchar *line, memory_pool_t *pool)
re_cache = memory_pool_get_variable (pool, "re_cache");
if (re_cache == NULL) {
- re_cache = g_hash_table_new (g_str_hash, g_str_equal);
+ re_cache = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
memory_pool_set_variable (pool, "re_cache", re_cache, (pool_destruct_func)g_hash_table_destroy);
return NULL;
}
@@ -102,7 +102,7 @@ re_cache_add (const gchar *line, void *pointer, memory_pool_t *pool)
re_cache = memory_pool_get_variable (pool, "re_cache");
if (re_cache == NULL) {
- re_cache = g_hash_table_new (g_str_hash, g_str_equal);
+ re_cache = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
memory_pool_set_variable (pool, "re_cache", re_cache, (pool_destruct_func)g_hash_table_destroy);
}
diff --git a/src/filter.c b/src/filter.c
index d26a34af0..bbe3b4678 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -75,7 +75,7 @@ insert_metric_result (struct worker_task *task, struct metric *metric, const gch
if (metric_res == NULL) {
/* Create new metric chain */
metric_res = memory_pool_alloc (task->task_pool, sizeof (struct metric_result));
- metric_res->symbols = g_hash_table_new (g_str_hash, g_str_equal);
+ metric_res->symbols = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
metric_res->checked = FALSE;
memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_hash_table_unref, metric_res->symbols);
metric_res->metric = metric;
diff --git a/src/lmtp.c b/src/lmtp.c
index 4e7373158..67743e92a 100644
--- a/src/lmtp.c
+++ b/src/lmtp.c
@@ -271,7 +271,7 @@ accept_socket (gint fd, short what, void *arg)
new_task->task_pool = memory_pool_new (memory_pool_get_size ());
/* Add destructor for recipients list (it would be better to use anonymous function here */
memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func) rcpt_destruct, new_task);
- new_task->results = g_hash_table_new (g_str_hash, g_str_equal);
+ new_task->results = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
new_task->ev_base = worker->ctx;
memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func) g_hash_table_destroy, new_task->results);
worker->srv->stat->connections_count++;
diff --git a/src/lua_worker.c b/src/lua_worker.c
index c75d275a6..c4276968a 100644
--- a/src/lua_worker.c
+++ b/src/lua_worker.c
@@ -410,7 +410,7 @@ init_lua_worker (void)
type = g_quark_try_string ("lua");
ctx = g_malloc0 (sizeof (struct rspamd_lua_worker_ctx));
- ctx->params = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, (GDestroyNotify)g_list_free);
+ ctx->params = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, g_free, (GDestroyNotify)g_list_free);
register_worker_opt (type, "file", xml_handle_string, ctx, G_STRUCT_OFFSET (struct rspamd_lua_worker_ctx, file));
diff --git a/src/main.c b/src/main.c
index a5cad0ba9..d28672e83 100644
--- a/src/main.c
+++ b/src/main.c
@@ -878,7 +878,7 @@ main (gint argc, gchar **argv, gchar **env)
}
/* Init counters */
- rspamd_main->counters = rspamd_hash_new_shared (rspamd_main->server_pool, g_str_hash, g_str_equal, 64);
+ rspamd_main->counters = rspamd_hash_new_shared (rspamd_main->server_pool, rspamd_str_hash, rspamd_str_equal, 64);
/* Init listen sockets hash */
listen_sockets = g_hash_table_new (g_direct_hash, g_direct_equal);
@@ -893,7 +893,7 @@ main (gint argc, gchar **argv, gchar **env)
rspamd_main->cfg->cache = g_new0 (struct symbols_cache, 1);
rspamd_main->cfg->cache->static_pool = memory_pool_new (memory_pool_get_size ());
rspamd_main->cfg->cache->cfg = rspamd_main->cfg;
- rspamd_main->cfg->cache->items_by_symbol = g_hash_table_new (g_str_hash, g_str_equal);
+ rspamd_main->cfg->cache->items_by_symbol = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
/* Load config */
if (! load_rspamd_config (rspamd_main->cfg, TRUE)) {
diff --git a/src/mem_pool.c b/src/mem_pool.c
index f7636e538..5ab582352 100644
--- a/src/mem_pool.c
+++ b/src/mem_pool.c
@@ -746,7 +746,7 @@ void
memory_pool_set_variable (memory_pool_t *pool, const gchar *name, gpointer value, pool_destruct_func destructor)
{
if (pool->variables == NULL) {
- pool->variables = g_hash_table_new (g_str_hash, g_str_equal);
+ pool->variables = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
}
g_hash_table_insert (pool->variables, memory_pool_strdup (pool, name), value);
diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c
index f5c94bd1f..ebb4ff648 100644
--- a/src/plugins/regexp.c
+++ b/src/plugins/regexp.c
@@ -514,7 +514,7 @@ regexp_module_init (struct config_file *cfg, struct module_ctx **ctx)
regexp_module_ctx->filter = regexp_common_filter;
regexp_module_ctx->regexp_pool = memory_pool_new (memory_pool_get_size ());
regexp_module_ctx->dynamic_pool = NULL;
- regexp_module_ctx->autolearn_symbols = g_hash_table_new (g_str_hash, g_str_equal);
+ regexp_module_ctx->autolearn_symbols = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
regexp_module_ctx->workers = NULL;
*ctx = (struct module_ctx *)regexp_module_ctx;
diff --git a/src/settings.c b/src/settings.c
index d21fe02d0..0d856bf3f 100644
--- a/src/settings.c
+++ b/src/settings.c
@@ -81,12 +81,12 @@ settings_ref (struct rspamd_settings *s)
{
if (s == NULL) {
s = g_slice_alloc (sizeof (struct rspamd_settings));
- s->metric_scores = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free);
- s->reject_scores = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free);
- s->metric_actions = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, settings_actions_free);
- s->factors = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free);
- s->whitelist = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free);
- s->blacklist = g_hash_table_new_full (g_str_hash, g_str_equal, g_free, g_free);
+ s->metric_scores = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, g_free, g_free);
+ s->reject_scores = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, g_free, g_free);
+ s->metric_actions = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, g_free, settings_actions_free);
+ s->factors = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, g_free, g_free);
+ s->whitelist = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, g_free, g_free);
+ s->blacklist = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal, g_free, g_free);
s->statfile_alias = NULL;
s->want_spam = FALSE;
s->ref_count = 1;
diff --git a/src/symbols_cache.c b/src/symbols_cache.c
index 9b8d57026..b273ea1c9 100644
--- a/src/symbols_cache.c
+++ b/src/symbols_cache.c
@@ -271,7 +271,7 @@ register_symbol_common (struct symbols_cache **cache, const gchar *name, double
pcache = g_new0 (struct symbols_cache, 1);
*cache = pcache;
pcache->static_pool = memory_pool_new (memory_pool_get_size ());
- pcache->items_by_symbol = g_hash_table_new (g_str_hash, g_str_equal);
+ pcache->items_by_symbol = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
}
item = memory_pool_alloc0 (pcache->static_pool, sizeof (struct cache_item));
diff --git a/src/util.c b/src/util.c
index 30c85ddf2..707f4c449 100644
--- a/src/util.c
+++ b/src/util.c
@@ -947,6 +947,7 @@ calculate_check_time (struct timeval *begin, gint resolution)
# define g_tolower(x) (((x) >= 'A' && (x) <= 'Z') ? (x) - 'A' + 'a' : (x))
#endif
+
gboolean
rspamd_strcase_equal (gconstpointer v, gconstpointer v2)
{
@@ -962,16 +963,43 @@ guint
rspamd_strcase_hash (gconstpointer key)
{
const gchar *p = key;
- guint h = 0;
+ gchar buf[256];
+ guint h = 0, i = 0;
+
while (*p != '\0') {
- h = (h << 5) - h + g_tolower (*p);
+ buf[i] = g_ascii_tolower (*p);
+ i++;
p++;
+ if (i == sizeof (buf)) {
+ h ^= murmur32_hash (buf, i);
+ i = 0;
+ }
+ }
+
+ if (i > 0) {
+ h ^= murmur32_hash (buf, i);
}
return h;
}
+guint
+rspamd_str_hash (gconstpointer key)
+{
+ gsize len;
+
+ len = strlen ((const gchar *)key);
+
+ return murmur32_hash (key, len);
+}
+
+gboolean
+rspamd_str_equal (gconstpointer v, gconstpointer v2)
+{
+ return strcmp ((const gchar *)v, (const gchar *)v2) == 0;
+}
+
gboolean
fstr_strcase_equal (gconstpointer v, gconstpointer v2)
{
@@ -987,14 +1015,24 @@ fstr_strcase_equal (gconstpointer v, gconstpointer v2)
guint
fstr_strcase_hash (gconstpointer key)
{
- const f_str_t *f = key;
+ const f_str_t *f = key;
const gchar *p;
- guint h = 0;
+ guint h = 0, i = 0;
+ gchar buf[256];
p = f->begin;
while (p - f->begin < (gint)f->len) {
- h = (h << 5) - h + g_tolower (*p);
+ buf[i] = g_ascii_tolower (*p);
+ i++;
p++;
+ if (i == sizeof (buf)) {
+ h ^= murmur32_hash (buf, i);
+ i = 0;
+ }
+ }
+
+ if (i > 0) {
+ h ^= murmur32_hash (buf, i);
}
return h;
@@ -1560,6 +1598,176 @@ rspamd_create_thread (const gchar *name, GThreadFunc func, gpointer data, GError
return new;
}
+guint32
+murmur32_hash (const guint8 *in, gsize len)
+{
+
+
+ const guint32 c1 = 0xcc9e2d51;
+ const guint32 c2 = 0x1b873593;
+
+ const int nblocks = len / 4;
+ const guint32 *blocks = (const guint32 *)(in);
+ const guint8 *tail;
+ guint32 h = 0;
+ gint i;
+ guint32 k;
+
+ if (in == NULL || len == 0) {
+ return 0;
+ }
+
+ tail = (const guint8 *)(in + (nblocks * 4));
+
+ for (i = 0; i < nblocks; i++) {
+ k = blocks[i];
+
+ k *= c1;
+ k = (k << 15) | (k >> (32 - 15));
+ k *= c2;
+
+ h ^= k;
+ h = (h << 13) | (h >> (32 - 13));
+ h = (h * 5) + 0xe6546b64;
+ }
+
+ k = 0;
+ switch (len & 3) {
+ case 3:
+ k ^= tail[2] << 16;
+ case 2:
+ k ^= tail[1] << 8;
+ case 1:
+ k ^= tail[0];
+ k *= c1;
+ k = (k << 13) | (k >> (32 - 15));
+ k *= c2;
+ h ^= k;
+ };
+
+ h ^= len;
+
+ h ^= h >> 16;
+ h *= 0x85ebca6b;
+ h ^= h >> 13;
+ h *= 0xc2b2ae35;
+ h ^= h >> 16;
+
+ return h;
+}
+
+void
+murmur128_hash (const guint8 *in, gsize len, guint64 out[])
+{
+ const guint64 c1 = 0x87c37b91114253d5ULL;
+ const guint64 c2 = 0x4cf5ad432745937fULL;
+ const gint nblocks = len / 16;
+ const guint64 *blocks = (const guint64 *)(in);
+ const guint8 *tail;
+ guint64 h1 = 0;
+ guint64 h2 = 0;
+ int i;
+ guint64 k1, k2;
+
+ if (in == NULL || len == 0 || out == NULL) {
+ return;
+ }
+
+ tail = (const guint8 *)(in + (nblocks * 16));
+
+ for (i = 0; i < nblocks; i++) {
+ k1 = blocks[i*2+0];
+ k2 = blocks[i*2+1];
+
+ k1 *= c1;
+ k1 = (k1 << 31) | (k1 >> (64 - 31));
+ k1 *= c2;
+ h1 ^= k1;
+
+ h1 = (h1 << 27) | (h1 >> (64 - 27));
+ h1 += h2;
+ h1 = h1*5+0x52dce729;
+
+ k2 *= c2;
+ k2 = (k2 << 33) | (k2 >> (64 - 33));
+ k2 *= c1;
+ h2 ^= k2;
+
+ h2 = (h2 << 31) | (h2 >> (64 - 31));
+ h2 += h1;
+ h2 = h2*5+0x38495ab5;
+ }
+
+ k1 = k2 = 0;
+ switch (len & 15) {
+ case 15:
+ k2 ^= (guint64)(tail[14]) << 48;
+ case 14:
+ k2 ^= (guint64)(tail[13]) << 40;
+ case 13:
+ k2 ^= (guint64)(tail[12]) << 32;
+ case 12:
+ k2 ^= (guint64)(tail[11]) << 24;
+ case 11:
+ k2 ^= (guint64)(tail[10]) << 16;
+ case 10:
+ k2 ^= (guint64)(tail[ 9]) << 8;
+ case 9:
+ k2 ^= (guint64)(tail[ 8]) << 0;
+ k2 *= c2;
+ k2 = (k2 << 33) | (k2 >> (64 - 33));
+ k2 *= c1;
+ h2 ^= k2;
+
+ case 8:
+ k1 ^= (guint64)(tail[ 7]) << 56;
+ case 7:
+ k1 ^= (guint64)(tail[ 6]) << 48;
+ case 6:
+ k1 ^= (guint64)(tail[ 5]) << 40;
+ case 5:
+ k1 ^= (guint64)(tail[ 4]) << 32;
+ case 4:
+ k1 ^= (guint64)(tail[ 3]) << 24;
+ case 3:
+ k1 ^= (guint64)(tail[ 2]) << 16;
+ case 2:
+ k1 ^= (guint64)(tail[ 1]) << 8;
+ case 1:
+ k1 ^= (guint64)(tail[ 0]) << 0;
+ k1 *= c1;
+ k1 = (k1 << 31) | (k1 >> (64 - 31));
+ k1 *= c2;
+ h1 ^= k1;
+ };
+
+ //----------
+ // finalization
+
+ h1 ^= len;
+ h2 ^= len;
+
+ h1 += h2;
+ h2 += h1;
+
+ h1 ^= h1 >> 33;
+ h1 *= 0xff51afd7ed558ccdULL;
+ h1 ^= h1 >> 33;
+ h1 *= 0xc4ceb9fe1a85ec53ULL;
+ h1 ^= h1 >> 33;
+
+ h2 ^= h2 >> 33;
+ h2 *= 0xff51afd7ed558ccdULL;
+ h2 ^= h2 >> 33;
+ h2 *= 0xc4ceb9fe1a85ec53ULL;
+ h2 ^= h2 >> 33;
+
+ h1 += h2;
+ h2 += h1;
+
+ out[0] = h1;
+ out[1] = h2;
+}
/*
* vi:ts=4
diff --git a/src/util.h b/src/util.h
index 961e75645..c8fd8158a 100644
--- a/src/util.h
+++ b/src/util.h
@@ -146,6 +146,13 @@ guint rspamd_strcase_hash (gconstpointer key);
gboolean rspamd_strcase_equal (gconstpointer v, gconstpointer v2);
/*
+ * Hash table utility functions for case sensitive hashing
+ */
+guint rspamd_str_hash (gconstpointer key);
+gboolean rspamd_str_equal (gconstpointer v, gconstpointer v2);
+
+
+/*
* Hash table utility functions for hashing fixed strings
*/
guint fstr_strcase_hash (gconstpointer key);
@@ -317,4 +324,39 @@ void rspamd_rwlock_free (rspamd_rwlock_t *mtx);
*/
GThread* rspamd_create_thread (const gchar *name, GThreadFunc func, gpointer data, GError **err);
+/**
+ * Return 32bit murmur hash value for specified input
+ * @param in input data
+ * @param len length of the input data
+ * @code
+ * MurmurHash3 was created by Austin Appleby in 2008. The cannonical
+ * implementations are in C++ and placed in the public.
+ *
+ * https://sites.google.com/site/murmurhash/
+ *
+ * Seungyoung Kim has ported it's cannonical implementation to C language
+ * in 2012 and published it as a part of qLibc component.
+ * @endcode
+ * @return
+ */
+guint32 murmur32_hash (const guint8 *in, gsize len);
+
+/**
+ * Return 32bit murmur hash value for specified input
+ * @param in input data
+ * @param len length of the input data
+ * @param out array of 2 guint64 variables
+ * @code
+ * MurmurHash3 was created by Austin Appleby in 2008. The cannonical
+ * implementations are in C++ and placed in the public.
+ *
+ * https://sites.google.com/site/murmurhash/
+ *
+ * Seungyoung Kim has ported it's cannonical implementation to C language
+ * in 2012 and published it as a part of qLibc component.
+ * @endcode
+ * @return
+ */
+void murmur128_hash (const guint8 *in, gsize len, guint64 out[]);
+
#endif
diff --git a/src/worker_util.c b/src/worker_util.c
index 976fe58c3..541d4f1e4 100644
--- a/src/worker_util.c
+++ b/src/worker_util.c
@@ -74,11 +74,11 @@ construct_task (struct rspamd_worker *worker)
/* Add destructor for recipients list (it would be better to use anonymous function here */
memory_pool_add_destructor (new_task->task_pool,
(pool_destruct_func) rcpt_destruct, new_task);
- new_task->results = g_hash_table_new (g_str_hash, g_str_equal);
+ new_task->results = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
memory_pool_add_destructor (new_task->task_pool,
(pool_destruct_func) g_hash_table_destroy,
new_task->results);
- new_task->re_cache = g_hash_table_new (g_str_hash, g_str_equal);
+ new_task->re_cache = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
memory_pool_add_destructor (new_task->task_pool,
(pool_destruct_func) g_hash_table_destroy,
new_task->re_cache);