diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2013-12-04 13:41:26 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2013-12-04 13:41:26 +0000 |
commit | 703fb40d6e37c5337a23694bce1bb114b7d7516a (patch) | |
tree | ebdba3aaa9018a01c21702c0785f00c609e77e21 | |
parent | d0314f0ca99d2485054b692de565729f4e961306 (diff) | |
download | rspamd-703fb40d6e37c5337a23694bce1bb114b7d7516a.tar.gz rspamd-703fb40d6e37c5337a23694bce1bb114b7d7516a.zip |
Rework fuzzy check module.
- All checks are now organized into rules.
- Allow specifying read_only rules to avoid problems when learning.
- Use a better normalizer for the fuzzy module; it now returns values
from 0 to 1.0 (as bayes does).
- Update configuration accordingly.
- Drop legacy configuration support.
- Detect tanh as well and provide a reasonable (linear) fallback.
-rw-r--r-- | CMakeLists.txt | 5 | ||||
-rw-r--r-- | conf/metrics.conf | 8 | ||||
-rw-r--r-- | conf/modules.conf | 38 | ||||
-rw-r--r-- | config.h.in | 1 | ||||
-rw-r--r-- | src/cfg_utils.c | 4 | ||||
-rw-r--r-- | src/plugins/fuzzy_check.c | 726 |
6 files changed, 405 insertions, 377 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 8937fe937..6e42fa9e8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,10 +10,10 @@ PROJECT(rspamd C) SET(RSPAMD_VERSION_MAJOR 0) SET(RSPAMD_VERSION_MINOR 6) -SET(RSPAMD_VERSION_PATCH 1) +SET(RSPAMD_VERSION_PATCH 2) SET(RSPAMD_VERSION "${RSPAMD_VERSION_MAJOR}.${RSPAMD_VERSION_MINOR}.${RSPAMD_VERSION_PATCH}") -SET(RSPAMD_MASTER_SITE_URL "http://bitbucket.org/vstakhov/rspamd") +SET(RSPAMD_MASTER_SITE_URL "https://rspamd.com") IF(NOT RSPAMD_USER) SET(RSPAMD_USER "nobody") @@ -862,6 +862,7 @@ CHECK_FUNCTION_EXISTS(wait4 HAVE_WAIT4) CHECK_FUNCTION_EXISTS(waitpid HAVE_WAITPID) CHECK_FUNCTION_EXISTS(flock HAVE_FLOCK) CHECK_FUNCTION_EXISTS(tanhl HAVE_TANHL) +CHECK_FUNCTION_EXISTS(tanh HAVE_TANH) CHECK_FUNCTION_EXISTS(expl HAVE_EXPL) CHECK_FUNCTION_EXISTS(exp2l HAVE_EXP2L) CHECK_FUNCTION_EXISTS(sendfile HAVE_SENDFILE) diff --git a/conf/metrics.conf b/conf/metrics.conf index 727bea6da..9dfdb1c00 100644 --- a/conf/metrics.conf +++ b/conf/metrics.conf @@ -408,22 +408,22 @@ metric { name = "BAYES_HAM"; } symbol { - weight = 1.0; + weight = 10.0; description = "Generic fuzzy hash match"; name = "R_FUZZY"; } symbol { - weight = 1.0; + weight = 10.0; description = "Denied fuzzy hash"; name = "FUZZY_DENIED"; } symbol { - weight = 1.0; + weight = 5.0; description = "Probable fuzzy hash"; name = "FUZZY_PROB"; } symbol { - weight = 1.0; + weight = -2.1; description = "Whitelisted fuzzy hash"; name = "FUZZY_WHITE"; } diff --git a/conf/modules.conf b/conf/modules.conf index 2b7d5b8b8..b951d863f 100644 --- a/conf/modules.conf +++ b/conf/modules.conf @@ -1,24 +1,26 @@ # Rspamd modules configuration fuzzy_check { - servers = "highsecure.ru:11335"; - symbol = "R_FUZZY"; min_bytes = 300; - max_score = 10; - mime_types = "application/pdf"; - fuzzy_map = { - FUZZY_DENIED { - weight = 10.0; - flag = 1 - } - FUZZY_PROB { - weight = 5.0; - flag = 2 - } - FUZZY_WHITE { - weight = -2.1; - flag = 3 - } - } + rule { + servers = 
"highsecure.ru:11335"; + symbol = "R_FUZZY"; + mime_types = "application/pdf"; + max_score = 10; + fuzzy_map = { + FUZZY_DENIED { + max_score = 10.0; + flag = 1 + } + FUZZY_PROB { + max_score = 5.0; + flag = 2 + } + FUZZY_WHITE { + max_score = 5.0; + flag = 3 + } + } + } } forged_recipients { symbol_sender = "FORGED_SENDER"; diff --git a/config.h.in b/config.h.in index c7e44d846..e387d5d1c 100644 --- a/config.h.in +++ b/config.h.in @@ -145,6 +145,7 @@ #cmakedefine HAVE_FLOCK 1 #cmakedefine HAVE_TANHL 1 +#cmakedefine HAVE_TANH 1 #cmakedefine HAVE_EXPL 1 #cmakedefine HAVE_EXP2L 1 diff --git a/src/cfg_utils.c b/src/cfg_utils.c index 765ff7f34..d015d60a0 100644 --- a/src/cfg_utils.c +++ b/src/cfg_utils.c @@ -635,12 +635,14 @@ internal_normalizer_func (struct config_file *cfg, long double score, void *data } #ifdef HAVE_TANHL return max * tanhl (score / max); -#else +#elif defined(HAVE_TANHL) /* * As some implementations of libm does not support tanhl, try to use * tanh */ return max * tanh ((double) (score / max)); +#else + return score < max ? 
score / max : max; #endif } diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c index b79ecc452..9d56806d6 100644 --- a/src/plugins/fuzzy_check.c +++ b/src/plugins/fuzzy_check.c @@ -60,72 +60,80 @@ #define DEFAULT_PORT 11335 struct storage_server { - struct upstream up; - gchar *name; - gchar *addr; - guint16 port; + struct upstream up; + gchar *name; + gchar *addr; + guint16 port; }; struct fuzzy_mapping { - guint64 fuzzy_flag; - const gchar *symbol; + guint64 fuzzy_flag; + const gchar *symbol; double weight; }; struct fuzzy_mime_type { - gchar *type; - gchar *subtype; + gchar *type; + gchar *subtype; +}; + +struct fuzzy_rule { + struct storage_server *servers; + gint servers_num; + const gchar *symbol; + GHashTable *mappings; + GList *mime_types; + double max_score; + gboolean read_only; }; struct fuzzy_ctx { - gint (*filter) (struct worker_task * task); - const gchar *symbol; - struct storage_server *servers; - gint servers_num; - memory_pool_t *fuzzy_pool; - double max_score; - guint32 min_hash_len; - radix_tree_t *whitelist; - GHashTable *mappings; - GList *mime_types; - guint32 min_bytes; - guint32 min_height; - guint32 min_width; - guint32 io_timeout; + gint (*filter) (struct worker_task * task); + memory_pool_t *fuzzy_pool; + GList *fuzzy_rules; + const gchar *default_symbol; + guint32 min_hash_len; + radix_tree_t *whitelist; + guint32 min_bytes; + guint32 min_height; + guint32 min_width; + guint32 io_timeout; }; struct fuzzy_client_session { - gint state; - fuzzy_hash_t *h; - struct event ev; - struct timeval tv; - struct worker_task *task; - struct storage_server *server; - gint fd; + gint state; + fuzzy_hash_t *h; + struct event ev; + struct timeval tv; + struct worker_task *task; + struct storage_server *server; + struct fuzzy_rule *rule; + gint fd; }; struct fuzzy_learn_session { - struct event ev; - fuzzy_hash_t *h; - gint cmd; - gint value; - gint flag; - gint *saved; - GError **err; - struct timeval tv; - struct controller_session 
*session; - struct storage_server *server; - struct worker_task *task; - gint fd; + struct event ev; + fuzzy_hash_t *h; + gint cmd; + gint value; + gint flag; + gint *saved; + GError **err; + struct timeval tv; + struct controller_session *session; + struct storage_server *server; + struct fuzzy_rule *rule; + struct worker_task *task; + gint fd; }; -static struct fuzzy_ctx *fuzzy_module_ctx = NULL; -static const gchar hex_digits[] = "0123456789abcdef"; +static struct fuzzy_ctx *fuzzy_module_ctx = NULL; +static const gchar hex_digits[] = "0123456789abcdef"; -static gint fuzzy_mime_filter (struct worker_task *task); -static void fuzzy_symbol_callback (struct worker_task *task, void *unused); -static void fuzzy_add_handler (gchar **args, struct controller_session *session); -static void fuzzy_delete_handler (gchar **args, struct controller_session *session); +static void fuzzy_symbol_callback (struct worker_task *task, void *unused); +static void fuzzy_add_handler (gchar **args, struct controller_session *session); +static void fuzzy_delete_handler (gchar **args, + struct controller_session *session); /* Initialization */ gint fuzzy_check_module_init (struct config_file *cfg, struct module_ctx **ctx); @@ -139,81 +147,35 @@ module_t fuzzy_check_module = { fuzzy_check_module_reconfig }; -/* Flags string is in format <numeric_flag>:<SYMBOL>:weight[, <numeric_flag>:<SYMBOL>:weight...] 
*/ static void -parse_flags_string_old (struct config_file *cfg, const gchar *str) -{ - gchar **strvec, *item, *err_str, **map_str; - gint num, i, t; - struct fuzzy_mapping *map; - - strvec = g_strsplit_set (str, ", ;", 0); - num = g_strv_length (strvec); - - for (i = 0; i < num; i ++) { - item = strvec[i]; - map_str = g_strsplit_set (item, ":", 3); - t = g_strv_length (map_str); - if (t != 3 && t != 2) { - msg_err ("invalid fuzzy mapping: %s", item); - } - else { - map = memory_pool_alloc (fuzzy_module_ctx->fuzzy_pool, sizeof (struct fuzzy_mapping)); - map->symbol = memory_pool_strdup (fuzzy_module_ctx->fuzzy_pool, map_str[1]); - - errno = 0; - map->fuzzy_flag = strtol (map_str[0], &err_str, 10); - if (errno != 0 || (err_str && *err_str != '\0')) { - msg_info ("cannot parse flag %s: %s", map_str[0], strerror (errno)); - continue; - } - else if (t == 2) { - /* Weight is skipped in definition */ - map->weight = fuzzy_module_ctx->max_score; - } - else { - map->weight = strtol (map_str[2], &err_str, 10); - - } - /* Add flag to hash table */ - g_hash_table_insert (fuzzy_module_ctx->mappings, GINT_TO_POINTER(map->fuzzy_flag), map); - register_virtual_symbol (&cfg->cache, map->symbol, map->weight); - } - g_strfreev (map_str); - } - - g_strfreev (strvec); -} - -static void -parse_flags_string (struct config_file *cfg, ucl_object_t *val) +parse_flags_string (struct fuzzy_rule *rule, struct config_file *cfg, ucl_object_t *val) { ucl_object_t *elt; struct fuzzy_mapping *map; const gchar *sym = NULL; if (val->type == UCL_STRING) { - parse_flags_string_old (cfg, ucl_obj_tostring (val)); + msg_err ("string mappings are deprecated and no longer supported, use new style configuration"); } else if (val->type == UCL_OBJECT) { - elt = ucl_obj_get_key (val, "symbol"); + elt = ucl_object_find_key (val, "symbol"); if (elt == NULL || !ucl_object_tostring_safe (elt, &sym)) { sym = ucl_object_key (val); } if (sym != NULL) { map = memory_pool_alloc (fuzzy_module_ctx->fuzzy_pool, sizeof 
(struct fuzzy_mapping)); map->symbol = sym; - elt = ucl_obj_get_key (val, "flag"); + elt = ucl_object_find_key (val, "flag"); if (elt != NULL && ucl_obj_toint_safe (elt, &map->fuzzy_flag)) { - elt = ucl_obj_get_key (val, "weight"); + elt = ucl_object_find_key (val, "weight"); if (elt != NULL) { map->weight = ucl_obj_todouble (elt); } else { - map->weight = fuzzy_module_ctx->max_score; + map->weight = rule->max_score; } /* Add flag to hash table */ - g_hash_table_insert (fuzzy_module_ctx->mappings, GINT_TO_POINTER (map->fuzzy_flag), map); + g_hash_table_insert (rule->mappings, GINT_TO_POINTER (map->fuzzy_flag), map); register_virtual_symbol (&cfg->cache, map->symbol, map->weight); } else { @@ -261,12 +223,12 @@ parse_mime_types (const gchar *str) } static gboolean -fuzzy_check_content_type (GMimeContentType *type) +fuzzy_check_content_type (struct fuzzy_rule *rule, GMimeContentType *type) { struct fuzzy_mime_type *ft; GList *cur; - cur = fuzzy_module_ctx->mime_types; + cur = rule->mime_types; while (cur) { ft = cur->data; if (g_mime_content_type_is_type (type, ft->type, ft->subtype)) { @@ -279,7 +241,7 @@ fuzzy_check_content_type (GMimeContentType *type) } static void -parse_servers_string (const gchar *str) +parse_servers_string (struct fuzzy_rule *rule, const gchar *str) { gchar **strvec; gint i, num; @@ -288,18 +250,18 @@ parse_servers_string (const gchar *str) strvec = g_strsplit_set (str, ",", 0); num = g_strv_length (strvec); - fuzzy_module_ctx->servers = memory_pool_alloc0 (fuzzy_module_ctx->fuzzy_pool, sizeof (struct storage_server) * num); + rule->servers = memory_pool_alloc0 (fuzzy_module_ctx->fuzzy_pool, sizeof (struct storage_server) * num); for (i = 0; i < num; i++) { g_strstrip (strvec[i]); - cur = &fuzzy_module_ctx->servers[fuzzy_module_ctx->servers_num]; + cur = &rule->servers[rule->servers_num]; if (parse_host_port (fuzzy_module_ctx->fuzzy_pool, strvec[i], &cur->addr, &cur->port)) { if (cur->port == 0) { cur->port = DEFAULT_PORT; } cur->name = 
memory_pool_strdup (fuzzy_module_ctx->fuzzy_pool, strvec[i]); - fuzzy_module_ctx->servers_num++; + rule->servers_num++; } } @@ -310,22 +272,14 @@ parse_servers_string (const gchar *str) static double fuzzy_normalize (gint32 in, double weight) { - double ms = weight, ams = fabs (ms), ain = fabs (in); - - if (ams > 0.001) { - if (ain < ams / 2.) { - return in; - } - else if (ain < ams * 2.) { - ain = ain / 3. + ams / 3.; - return in > 0 ? ain : -(ain); - } - else { - return in > 0 ? ms : -(ms); - } + if (weight == 0) { + return 0; } - - return (double)in; +#ifdef HAVE_TANH + return tanh ((double)in / weight); +#else + return (in < weight ? in / weight : weight); +#endif } static const gchar * @@ -349,16 +303,86 @@ fuzzy_to_string (fuzzy_hash_t *h) return strbuf; } +static struct fuzzy_rule * +fuzzy_rule_new (const char *default_symbol, memory_pool_t *pool) +{ + struct fuzzy_rule *rule; + + rule = memory_pool_alloc0 (pool, sizeof (struct fuzzy_rule)); + + rule->mappings = g_hash_table_new (g_direct_hash, g_direct_equal); + rule->symbol = default_symbol; + memory_pool_add_destructor (pool, (pool_destruct_func)g_hash_table_unref, rule->mappings); + rule->read_only = TRUE; + + return rule; +} + +static gint +fuzzy_parse_rule (struct config_file *cfg, ucl_object_t *obj) +{ + ucl_object_t *value, *cur; + struct fuzzy_rule *rule; + ucl_object_iter_t it = NULL; + + if (obj->type != UCL_OBJECT) { + msg_err ("invalid rule definition"); + return -1; + } + + rule = fuzzy_rule_new (fuzzy_module_ctx->default_symbol, fuzzy_module_ctx->fuzzy_pool); + + if ((value = ucl_object_find_key (obj, "mime_types")) != NULL) { + if (value->type == UCL_ARRAY) { + value = value->value.av; + } + LL_FOREACH (value, cur) { + rule->mime_types = g_list_concat (rule->mime_types, + parse_mime_types (ucl_obj_tostring (cur))); + } + } + + if ((value = ucl_object_find_key (obj, "servers")) != NULL) { + if (value->type == UCL_ARRAY) { + value = value->value.av; + } + LL_FOREACH (value, cur) { + 
parse_servers_string (rule, ucl_obj_tostring (cur)); + } + } + if ((value = ucl_object_find_key (obj, "fuzzy_map")) != NULL) { + while ((cur = ucl_iterate_object (value, &it, true)) != NULL) { + parse_flags_string (rule, cfg, cur); + } + } + + if (rule->servers_num == 0) { + msg_err ("no servers defined for fuzzy rule with symbol: %s", rule->symbol); + return -1; + } + else { + fuzzy_module_ctx->fuzzy_rules = g_list_prepend (fuzzy_module_ctx->fuzzy_rules, rule); + if (rule->symbol != fuzzy_module_ctx->default_symbol) { + register_virtual_symbol (&cfg->cache, rule->symbol, 1.0); + } + } + + if ((value = ucl_object_find_key (obj, "max_score")) != NULL) { + rule->max_score = ucl_obj_todouble (value); + } + if ((value = ucl_object_find_key (obj, "read_only")) != NULL) { + rule->read_only = ucl_obj_toboolean (value); + } + + return 0; +} + gint fuzzy_check_module_init (struct config_file *cfg, struct module_ctx **ctx) { fuzzy_module_ctx = g_malloc0 (sizeof (struct fuzzy_ctx)); - fuzzy_module_ctx->filter = fuzzy_mime_filter; fuzzy_module_ctx->fuzzy_pool = memory_pool_new (memory_pool_get_size ()); - fuzzy_module_ctx->servers = NULL; - fuzzy_module_ctx->servers_num = 0; - fuzzy_module_ctx->mappings = g_hash_table_new (g_direct_hash, g_direct_equal); *ctx = (struct module_ctx *)fuzzy_module_ctx; @@ -369,20 +393,13 @@ gint fuzzy_check_module_config (struct config_file *cfg) { ucl_object_t *value, *cur; - ucl_object_iter_t it = NULL; - gint res = TRUE; + gint res = TRUE; if ((value = get_module_opt (cfg, "fuzzy_check", "symbol")) != NULL) { - fuzzy_module_ctx->symbol = ucl_obj_tostring (value); + fuzzy_module_ctx->default_symbol = ucl_obj_tostring (value); } else { - fuzzy_module_ctx->symbol = DEFAULT_SYMBOL; - } - if ((value = get_module_opt (cfg, "fuzzy_check", "max_score")) != NULL) { - fuzzy_module_ctx->max_score = ucl_obj_todouble (value); - } - else { - fuzzy_module_ctx->max_score = 0.; + fuzzy_module_ctx->default_symbol = DEFAULT_SYMBOL; } if ((value = get_module_opt 
(cfg, "fuzzy_check", "min_length")) != NULL) { @@ -415,11 +432,6 @@ fuzzy_check_module_config (struct config_file *cfg) else { fuzzy_module_ctx->io_timeout = DEFAULT_IO_TIMEOUT; } - if ((value = get_module_opt (cfg, "fuzzy_check", "mime_types")) != NULL) { - LL_FOREACH (value, cur) { - fuzzy_module_ctx->mime_types = parse_mime_types (ucl_obj_tostring (cur)); - } - } if ((value = get_module_opt (cfg, "fuzzy_check", "whitelist")) != NULL) { fuzzy_module_ctx->whitelist = radix_tree_create (); @@ -433,21 +445,24 @@ fuzzy_check_module_config (struct config_file *cfg) fuzzy_module_ctx->whitelist = NULL; } - if ((value = get_module_opt (cfg, "fuzzy_check", "servers")) != NULL) { + if ((value = get_module_opt (cfg, "fuzzy_check", "rule")) != NULL) { LL_FOREACH (value, cur) { - parse_servers_string (ucl_obj_tostring (cur)); - } - } - if ((value = get_module_opt (cfg, "fuzzy_check", "fuzzy_map")) != NULL) { - while ((cur = ucl_iterate_object (value, &it, true)) != NULL) { - parse_flags_string (cfg, cur); + if (fuzzy_parse_rule (cfg, cur) == -1) { + return -1; + } } } - register_symbol (&cfg->cache, fuzzy_module_ctx->symbol, fuzzy_module_ctx->max_score, fuzzy_symbol_callback, NULL); + if (fuzzy_module_ctx->fuzzy_rules != NULL) { + register_callback_symbol (&cfg->cache, fuzzy_module_ctx->default_symbol, + 1.0, fuzzy_symbol_callback, NULL); - register_custom_controller_command ("fuzzy_add", fuzzy_add_handler, TRUE, TRUE); - register_custom_controller_command ("fuzzy_del", fuzzy_delete_handler, TRUE, TRUE); + register_custom_controller_command ("fuzzy_add", fuzzy_add_handler, TRUE, TRUE); + register_custom_controller_command ("fuzzy_del", fuzzy_delete_handler, TRUE, TRUE); + } + else { + msg_warn ("fuzzy module is enabled but no rules are defined"); + } return res; } @@ -456,11 +471,9 @@ gint fuzzy_check_module_reconfig (struct config_file *cfg) { memory_pool_delete (fuzzy_module_ctx->fuzzy_pool); - fuzzy_module_ctx->servers = NULL; - fuzzy_module_ctx->servers_num = 0; + 
fuzzy_module_ctx->fuzzy_pool = memory_pool_new (memory_pool_get_size ()); - g_hash_table_remove_all (fuzzy_module_ctx->mappings); return fuzzy_check_module_config (cfg); } @@ -518,18 +531,18 @@ fuzzy_io_callback (gint fd, short what, void *arg) } *err_str = '\0'; /* Get mapping by flag */ - if ((map = g_hash_table_lookup (fuzzy_module_ctx->mappings, GINT_TO_POINTER (flag))) == NULL) { + if ((map = g_hash_table_lookup (session->rule->mappings, GINT_TO_POINTER (flag))) == NULL) { /* Default symbol and default weight */ - symbol = fuzzy_module_ctx->symbol; - nval = fuzzy_normalize (value, fuzzy_module_ctx->max_score); + symbol = session->rule->symbol; + nval = fuzzy_normalize (value, session->rule->max_score); } else { /* Get symbol and weight from map */ symbol = map->symbol; nval = fuzzy_normalize (value, map->weight); } - msg_info ("<%s>, found fuzzy hash '%s' with weight: %.2f, in list: %d", - session->task->message_id, fuzzy_to_string (session->h), flag, nval); + msg_info ("<%s>, found fuzzy hash '%s' with weight: %.2f, in list: %s:%d", + session->task->message_id, fuzzy_to_string (session->h), nval, symbol, flag); rspamd_snprintf (buf, sizeof (buf), "%d: %d / %.2f", flag, value, nval); insert_result (session->task, symbol, nval, g_list_prepend (NULL, memory_pool_strdup (session->task->task_pool, buf))); @@ -575,7 +588,9 @@ fuzzy_learn_callback (gint fd, short what, void *arg) cmd.flag = session->flag; if (write (fd, &cmd, sizeof (struct fuzzy_cmd)) == -1) { if (*(session->err) == NULL) { - g_set_error (session->err, g_quark_from_static_string ("fuzzy check"), 404, "write socket error: %s", strerror (errno)); + g_set_error (session->err, + g_quark_from_static_string ("fuzzy check"), + errno, "write socket error: %s", strerror (errno)); } goto err; } @@ -587,21 +602,27 @@ fuzzy_learn_callback (gint fd, short what, void *arg) } else if (what == EV_READ) { if (read (fd, buf, sizeof (buf)) == -1) { - msg_info ("cannot add fuzzy hash for message <%s>", 
session->task->message_id); + msg_info ("cannot add fuzzy hash for message <%s> to list %s:%d", session->task->message_id, + session->rule->symbol, session->flag); if (*(session->err) == NULL) { - g_set_error (session->err, g_quark_from_static_string ("fuzzy check"), 404, "read socket error: %s", strerror (errno)); + g_set_error (session->err, + g_quark_from_static_string ("fuzzy check"), + errno, "read socket error: %s", strerror (errno)); } goto err; } else if (buf[0] == 'O' && buf[1] == 'K') { - msg_info ("added fuzzy hash '%s' to list: %d for message <%s>", - fuzzy_to_string (session->h), session->flag, session->task->message_id); + msg_info ("added fuzzy hash '%s' to list: %s:%d for message <%s>", + fuzzy_to_string (session->h), session->rule->symbol, + session->flag, session->task->message_id); goto ok; } else { - msg_info ("cannot add fuzzy hash for message <%s>", session->task->message_id); + msg_info ("cannot add fuzzy hash for message <%s> to list %s:%d", session->task->message_id, + session->rule->symbol, session->flag); if (*(session->err) == NULL) { - g_set_error (session->err, g_quark_from_static_string ("fuzzy check"), 500, "add fuzzy error"); + g_set_error (session->err, + g_quark_from_static_string ("fuzzy check"), EINVAL, "add fuzzy error"); } goto ok; } @@ -614,7 +635,8 @@ fuzzy_learn_callback (gint fd, short what, void *arg) return; err: - msg_err ("got error in IO with server %s:%d, %d, %s", session->server->name, session->server->port, errno, strerror (errno)); + msg_err ("got error in IO with server %s, %d, %s", + session->server->name, errno, strerror (errno)); ok: if (--(*(session->saved)) == 0) { session->session->state = STATE_REPLY; @@ -648,7 +670,7 @@ ok: } static inline void -register_fuzzy_call (struct worker_task *task, fuzzy_hash_t *h) +register_fuzzy_call (struct worker_task *task, struct fuzzy_rule *rule, fuzzy_hash_t *h) { struct fuzzy_client_session *session; struct storage_server *selected; @@ -656,11 +678,12 @@ 
register_fuzzy_call (struct worker_task *task, fuzzy_hash_t *h) /* Get upstream */ #ifdef HAVE_CLOCK_GETTIME - selected = (struct storage_server *)get_upstream_by_hash (fuzzy_module_ctx->servers, fuzzy_module_ctx->servers_num, + selected = (struct storage_server *)get_upstream_by_hash (rule->servers, rule->servers_num, sizeof (struct storage_server), task->ts.tv_sec, - DEFAULT_UPSTREAM_ERROR_TIME, DEFAULT_UPSTREAM_DEAD_TIME, DEFAULT_UPSTREAM_MAXERRORS, h->hash_pipe, sizeof (h->hash_pipe)); + DEFAULT_UPSTREAM_ERROR_TIME, DEFAULT_UPSTREAM_DEAD_TIME, DEFAULT_UPSTREAM_MAXERRORS, + h->hash_pipe, sizeof (h->hash_pipe)); #else - selected = (struct storage_server *)get_upstream_by_hash (fuzzy_module_ctx->servers, fuzzy_module_ctx->servers_num, + selected = (struct storage_server *)get_upstream_by_hash (rule->servers, rule->servers_num, sizeof (struct storage_server), task->tv.tv_sec, DEFAULT_UPSTREAM_ERROR_TIME, DEFAULT_UPSTREAM_DEAD_TIME, DEFAULT_UPSTREAM_MAXERRORS, h->hash_pipe, sizeof (h->hash_pipe)); #endif @@ -678,15 +701,15 @@ register_fuzzy_call (struct worker_task *task, fuzzy_hash_t *h) session->task = task; session->fd = sock; session->server = selected; + session->rule = rule; event_add (&session->ev, &session->tv); register_async_event (task->s, fuzzy_io_fin, session, g_quark_from_static_string ("fuzzy check")); } } } -/* This callback is called when we check message via fuzzy hashes storage */ static void -fuzzy_symbol_callback (struct worker_task *task, void *unused) +fuzzy_check_rule (struct worker_task *task, struct fuzzy_rule *rule) { struct mime_text_part *part; struct mime_part *mime_part; @@ -696,25 +719,6 @@ fuzzy_symbol_callback (struct worker_task *task, void *unused) GList *cur; fuzzy_hash_t *fake_fuzzy; - - /* Check whitelist */ -#ifdef HAVE_INET_PTON - if (fuzzy_module_ctx->whitelist && !task->from_addr.ipv6 && task->from_addr.d.in4.s_addr != INADDR_NONE) { - if (radix32tree_find (fuzzy_module_ctx->whitelist, ntohl ((guint32) 
task->from_addr.d.in4.s_addr)) != RADIX_NO_VALUE) { - msg_info ("<%s>, address %s is whitelisted, skip fuzzy check", - task->message_id, inet_ntoa (task->from_addr.d.in4)); - return; - } - } -#else - if (fuzzy_module_ctx->whitelist && task->from_addr.s_addr != 0) { - if (radix32tree_find (fuzzy_module_ctx->whitelist, ntohl ((guint32) task->from_addr.s_addr)) != RADIX_NO_VALUE) { - msg_info ("<%s>, address %s is whitelisted, skip fuzzy check", - task->message_id, inet_ntoa (task->from_addr)); - return; - } - } -#endif cur = task->text_parts; while (cur) { @@ -747,8 +751,8 @@ fuzzy_symbol_callback (struct worker_task *task, void *unused) continue; } - register_fuzzy_call (task, part->fuzzy); - register_fuzzy_call (task, part->double_fuzzy); + register_fuzzy_call (task, rule, part->fuzzy); + register_fuzzy_call (task, rule, part->double_fuzzy); cur = g_list_next (cur); } @@ -763,7 +767,7 @@ fuzzy_symbol_callback (struct worker_task *task, void *unused) /* Construct fake fuzzy hash */ fake_fuzzy = memory_pool_alloc0 (task->task_pool, sizeof (fuzzy_hash_t)); rspamd_strlcpy (fake_fuzzy->hash_pipe, checksum, sizeof (fake_fuzzy->hash_pipe)); - register_fuzzy_call (task, fake_fuzzy); + register_fuzzy_call (task, rule, fake_fuzzy); g_free (checksum); } } @@ -774,13 +778,14 @@ fuzzy_symbol_callback (struct worker_task *task, void *unused) cur = task->parts; while (cur) { mime_part = cur->data; - if (mime_part->content->len > 0 && fuzzy_check_content_type (mime_part->type)) { + if (mime_part->content->len > 0 && fuzzy_check_content_type (rule, mime_part->type)) { if (fuzzy_module_ctx->min_bytes <= 0 || mime_part->content->len >= fuzzy_module_ctx->min_bytes) { - checksum = g_compute_checksum_for_data (G_CHECKSUM_MD5, mime_part->content->data, mime_part->content->len); + checksum = g_compute_checksum_for_data (G_CHECKSUM_MD5, + mime_part->content->data, mime_part->content->len); /* Construct fake fuzzy hash */ fake_fuzzy = memory_pool_alloc0 (task->task_pool, sizeof 
(fuzzy_hash_t)); rspamd_strlcpy (fake_fuzzy->hash_pipe, checksum, sizeof (fake_fuzzy->hash_pipe)); - register_fuzzy_call (task, fake_fuzzy); + register_fuzzy_call (task, rule, fake_fuzzy); g_free (checksum); } } @@ -788,41 +793,64 @@ fuzzy_symbol_callback (struct worker_task *task, void *unused) } } +/* This callback is called when we check message via fuzzy hashes storage */ +static void +fuzzy_symbol_callback (struct worker_task *task, void *unused) +{ + struct fuzzy_rule *rule; + GList *cur; + + /* Check whitelist */ +#ifdef HAVE_INET_PTON + if (fuzzy_module_ctx->whitelist && !task->from_addr.ipv6 && task->from_addr.d.in4.s_addr != INADDR_NONE) { + if (radix32tree_find (fuzzy_module_ctx->whitelist, ntohl ((guint32) task->from_addr.d.in4.s_addr)) != RADIX_NO_VALUE) { + msg_info ("<%s>, address %s is whitelisted, skip fuzzy check", + task->message_id, inet_ntoa (task->from_addr.d.in4)); + return; + } + } +#else + if (fuzzy_module_ctx->whitelist && task->from_addr.s_addr != 0) { + if (radix32tree_find (fuzzy_module_ctx->whitelist, ntohl ((guint32) task->from_addr.s_addr)) != RADIX_NO_VALUE) { + msg_info ("<%s>, address %s is whitelisted, skip fuzzy check", + task->message_id, inet_ntoa (task->from_addr)); + return; + } + } +#endif + + cur = fuzzy_module_ctx->fuzzy_rules; + while (cur) { + rule = cur->data; + fuzzy_check_rule (task, rule); + cur = g_list_next (cur); + } +} + static inline gboolean -register_fuzzy_controller_call (struct controller_session *session, struct worker_task *task, fuzzy_hash_t *h, - gint cmd, gint value, gint flag, gint *saved, GError **err) +register_fuzzy_controller_call (struct controller_session *session, + struct fuzzy_rule *rule, struct worker_task *task, fuzzy_hash_t *h, + gint cmd, gint value, gint flag, gint *saved, GError **err) { struct fuzzy_learn_session *s; struct storage_server *selected; - gint sock, r; - gchar out_buf[BUFSIZ]; + gint sock; /* Get upstream */ #ifdef HAVE_CLOCK_GETTIME - selected = (struct storage_server 
*)get_upstream_by_hash (fuzzy_module_ctx->servers, fuzzy_module_ctx->servers_num, + selected = (struct storage_server *)get_upstream_by_hash (rule->servers, rule->servers_num, sizeof (struct storage_server), task->ts.tv_sec, - DEFAULT_UPSTREAM_ERROR_TIME, DEFAULT_UPSTREAM_DEAD_TIME, DEFAULT_UPSTREAM_MAXERRORS, h->hash_pipe, sizeof (h->hash_pipe)); + DEFAULT_UPSTREAM_ERROR_TIME, DEFAULT_UPSTREAM_DEAD_TIME, DEFAULT_UPSTREAM_MAXERRORS, + h->hash_pipe, sizeof (h->hash_pipe)); #else - selected = (struct storage_server *)get_upstream_by_hash (fuzzy_module_ctx->servers, fuzzy_module_ctx->servers_num, + selected = (struct storage_server *)get_upstream_by_hash (rule->servers, rule->servers_num, sizeof (struct storage_server), task->tv.tv_sec, - DEFAULT_UPSTREAM_ERROR_TIME, DEFAULT_UPSTREAM_DEAD_TIME, DEFAULT_UPSTREAM_MAXERRORS, h->hash_pipe, sizeof (h->hash_pipe)); + DEFAULT_UPSTREAM_ERROR_TIME, DEFAULT_UPSTREAM_DEAD_TIME, DEFAULT_UPSTREAM_MAXERRORS, + h->hash_pipe, sizeof (h->hash_pipe)); #endif if (selected) { /* Create UDP socket */ if ((sock = make_universal_socket (selected->addr, selected->port, SOCK_DGRAM, TRUE, FALSE, FALSE)) == -1) { - msg_warn ("cannot connect to %s, %d, %s", selected->name, errno, strerror (errno)); - session->state = STATE_REPLY; - if (session->restful) { - r = rspamd_snprintf (out_buf, sizeof (out_buf), "HTTP/1.0 404 No hashes have been written" CRLF CRLF); - } - else { - r = rspamd_snprintf (out_buf, sizeof (out_buf), "no hashes have been written" CRLF "END" CRLF); - } - if (! 
rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE)) { - return FALSE; - } - free_task (task, FALSE); - rspamd_dispatcher_restore (session->dispatcher); return FALSE; } else { @@ -841,28 +869,117 @@ register_fuzzy_controller_call (struct controller_session *session, struct worke s->saved = saved; s->fd = sock; s->err = err; + s->rule = rule; event_add (&s->ev, &s->tv); (*saved)++; register_async_event (session->s, fuzzy_learn_fin, s, g_quark_from_static_string ("fuzzy check")); return TRUE; } } + return FALSE; } -static void -fuzzy_process_handler (struct controller_session *session, f_str_t * in) +static gboolean +fuzzy_process_rule (struct controller_session *session, struct fuzzy_rule *rule, + struct worker_task *task, GError **err, gint cmd, gint flag, gint value, gint *saved) { - struct worker_task *task; struct mime_text_part *part; struct mime_part *mime_part; struct rspamd_image *image; - GList *cur; - GError **err; - gint r, cmd = 0, value = 0, flag = 0, *saved, *sargs; - gchar out_buf[BUFSIZ], *checksum; + GList *cur; + gchar *checksum; fuzzy_hash_t fake_fuzzy; + /* Plan new event for writing */ + cur = task->text_parts; + + while (cur) { + part = cur->data; + if (part->is_empty || part->fuzzy == NULL || part->fuzzy->hash_pipe[0] == '\0' || + (fuzzy_module_ctx->min_bytes > 0 && part->content->len < fuzzy_module_ctx->min_bytes)) { + /* Skip empty parts */ + cur = g_list_next (cur); + continue; + } + if (! register_fuzzy_controller_call (session, rule, task, + part->fuzzy, cmd, value, flag, saved, err)) { + return FALSE; + } + if (! 
register_fuzzy_controller_call (session, rule, task, + part->double_fuzzy, cmd, value, flag, saved, err)) { + /* Cannot write hash */ + return FALSE; + } + cur = g_list_next (cur); + } + + /* Process images */ + cur = task->images; + while (cur) { + image = cur->data; + if (image->data->len > 0) { + if (fuzzy_module_ctx->min_height <= 0 || image->height >= fuzzy_module_ctx->min_height) { + if (fuzzy_module_ctx->min_width <= 0 || image->width >= fuzzy_module_ctx->min_width) { + checksum = g_compute_checksum_for_data (G_CHECKSUM_MD5, image->data->data, image->data->len); + /* Construct fake fuzzy hash */ + fake_fuzzy.block_size = 0; + memset (fake_fuzzy.hash_pipe, 0, sizeof (fake_fuzzy.hash_pipe)); + rspamd_strlcpy (fake_fuzzy.hash_pipe, checksum, sizeof (fake_fuzzy.hash_pipe)); + if (! register_fuzzy_controller_call (session, rule, task, + &fake_fuzzy, cmd, value, flag, saved, err)) { + g_free (checksum); + return FALSE; + } + + msg_info ("save hash of image: [%s] to list: %d", checksum, flag); + g_free (checksum); + } + } + } + cur = g_list_next (cur); + } + /* Process other parts */ + cur = task->parts; + while (cur) { + mime_part = cur->data; + if (mime_part->content->len > 0 && fuzzy_check_content_type (rule, mime_part->type)) { + if (fuzzy_module_ctx->min_bytes <= 0 || mime_part->content->len >= fuzzy_module_ctx->min_bytes) { + checksum = g_compute_checksum_for_data (G_CHECKSUM_MD5, + mime_part->content->data, mime_part->content->len); + /* Construct fake fuzzy hash */ + fake_fuzzy.block_size = 0; + memset (fake_fuzzy.hash_pipe, 0, sizeof (fake_fuzzy.hash_pipe)); + rspamd_strlcpy (fake_fuzzy.hash_pipe, checksum, sizeof (fake_fuzzy.hash_pipe)); + if (! 
register_fuzzy_controller_call (session, rule, task, + &fake_fuzzy, cmd, value, flag, saved, err)) { + return FALSE; + } + msg_info ("save hash of part of type: %s/%s: [%s] to list %d", + mime_part->type->type, mime_part->type->subtype, + checksum, flag); + g_free (checksum); + } + } + cur = g_list_next (cur); + } + + memory_pool_add_destructor (session->session_pool, (pool_destruct_func)free_task_soft, task); + + return TRUE; +} + +static void +fuzzy_process_handler (struct controller_session *session, f_str_t * in) +{ + struct fuzzy_rule *rule; + gboolean processed = FALSE, res = TRUE; + GList *cur; + struct worker_task *task; + GError **err; + gint r, cmd = 0, value = 0, flag = 0, *saved, *sargs; + gchar out_buf[BUFSIZ]; + /* Extract arguments */ if (session->other_data) { sargs = session->other_data; @@ -870,17 +987,17 @@ fuzzy_process_handler (struct controller_session *session, f_str_t * in) value = sargs[1]; flag = sargs[2]; } - + /* Prepare task */ task = construct_task (session->worker); session->other_data = task; session->state = STATE_WAIT; - + /* Allocate message from string */ task->msg = memory_pool_alloc (task->task_pool, sizeof (f_str_t)); task->msg->begin = in->begin; task->msg->len = in->len; - + saved = memory_pool_alloc0 (session->session_pool, sizeof (gint)); err = memory_pool_alloc0 (session->session_pool, sizeof (GError *)); @@ -901,130 +1018,29 @@ fuzzy_process_handler (struct controller_session *session, f_str_t * in) rspamd_dispatcher_restore (session->dispatcher); return; } - else { - /* Plan new event for writing */ - cur = task->text_parts; + cur = fuzzy_module_ctx->fuzzy_rules; + while (cur && res) { + rule = cur->data; - while (cur) { - part = cur->data; - if (part->is_empty || part->fuzzy == NULL || part->fuzzy->hash_pipe[0] == '\0' || - (fuzzy_module_ctx->min_bytes > 0 && part->content->len < fuzzy_module_ctx->min_bytes)) { - /* Skip empty parts */ - cur = g_list_next (cur); - continue; - } - if (! 
register_fuzzy_controller_call (session, task, part->fuzzy, cmd, value, flag, saved, err)) { - /* Cannot write hash */ - session->state = STATE_REPLY; - if (session->restful) { - r = rspamd_snprintf (out_buf, sizeof (out_buf), "HTTP/1.0 500 Cannot write fuzzy hash" CRLF CRLF); - } - else { - r = rspamd_snprintf (out_buf, sizeof (out_buf), "cannot write fuzzy hash" CRLF "END" CRLF); - } - if (! rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE)) { - return; - } - rspamd_dispatcher_restore (session->dispatcher); - free_task (task, FALSE); - return; - } - if (! register_fuzzy_controller_call (session, task, part->double_fuzzy, cmd, value, flag, saved, err)) { - /* Cannot write hash */ - session->state = STATE_REPLY; - if (session->restful) { - r = rspamd_snprintf (out_buf, sizeof (out_buf), "HTTP/1.0 500 Cannot write fuzzy hash" CRLF CRLF); - } - else { - r = rspamd_snprintf (out_buf, sizeof (out_buf), "cannot write fuzzy hash" CRLF "END" CRLF); - } - if (! rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE)) { - return; - } - free_task (task, FALSE); - rspamd_dispatcher_restore (session->dispatcher); - return; - } - cur = g_list_next (cur); - } - /* Process images */ - cur = task->images; - while (cur) { - image = cur->data; - if (image->data->len > 0) { - if (fuzzy_module_ctx->min_height <= 0 || image->height >= fuzzy_module_ctx->min_height) { - if (fuzzy_module_ctx->min_width <= 0 || image->width >= fuzzy_module_ctx->min_width) { - checksum = g_compute_checksum_for_data (G_CHECKSUM_MD5, image->data->data, image->data->len); - /* Construct fake fuzzy hash */ - fake_fuzzy.block_size = 0; - bzero (fake_fuzzy.hash_pipe, sizeof (fake_fuzzy.hash_pipe)); - rspamd_strlcpy (fake_fuzzy.hash_pipe, checksum, sizeof (fake_fuzzy.hash_pipe)); - if (! 
register_fuzzy_controller_call (session, task, &fake_fuzzy, cmd, value, flag, saved, err)) { - /* Cannot write hash */ - session->state = STATE_REPLY; - if (session->restful) { - r = rspamd_snprintf (out_buf, sizeof (out_buf), "HTTP/1.0 500 Cannot write fuzzy hash" CRLF CRLF); - } - else { - r = rspamd_snprintf (out_buf, sizeof (out_buf), "cannot write fuzzy hash" CRLF "END" CRLF); - } - g_free (checksum); - free_task (task, FALSE); - if (! rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE)) { - return; - } - rspamd_dispatcher_restore (session->dispatcher); - return; - } - - msg_info ("save hash of image: [%s] to list: %d", checksum, flag); - g_free (checksum); - } - } - } + if (rule->read_only) { cur = g_list_next (cur); + continue; } - /* Process other parts */ - cur = task->parts; - while (cur) { - mime_part = cur->data; - if (mime_part->content->len > 0 && fuzzy_check_content_type (mime_part->type)) { - if (fuzzy_module_ctx->min_bytes <= 0 || mime_part->content->len >= fuzzy_module_ctx->min_bytes) { - checksum = g_compute_checksum_for_data (G_CHECKSUM_MD5, mime_part->content->data, mime_part->content->len); - /* Construct fake fuzzy hash */ - fake_fuzzy.block_size = 0; - bzero (fake_fuzzy.hash_pipe, sizeof (fake_fuzzy.hash_pipe)); - rspamd_strlcpy (fake_fuzzy.hash_pipe, checksum, sizeof (fake_fuzzy.hash_pipe)); - if (! register_fuzzy_controller_call (session, task, &fake_fuzzy, cmd, value, flag, saved, err)) { - /* Cannot write hash */ - session->state = STATE_REPLY; - if (session->restful) { - r = rspamd_snprintf (out_buf, sizeof (out_buf), "HTTP/1.0 500 Cannot write fuzzy hash" CRLF CRLF); - } - else { - r = rspamd_snprintf (out_buf, sizeof (out_buf), "cannot write fuzzy hash" CRLF "END" CRLF); - } - g_free (checksum); - free_task (task, FALSE); - if (! 
rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE)) { - return; - } - rspamd_dispatcher_restore (session->dispatcher); - return; - } - msg_info ("save hash of part of type: %s/%s: [%s] to list %d", - mime_part->type->type, mime_part->type->subtype, - checksum, flag); - g_free (checksum); - } - } + + /* Check for flag */ + if (g_hash_table_lookup (rule->mappings, GINT_TO_POINTER (flag)) == NULL) { cur = g_list_next (cur); + continue; } - } - memory_pool_add_destructor (session->session_pool, (pool_destruct_func)free_task_soft, task); + processed = TRUE; + + res = fuzzy_process_rule (session, rule, task, err, cmd, flag, value, saved); + + cur = g_list_next (cur); + } - if (*saved == 0) { + if (!res) { session->state = STATE_REPLY; if (session->restful) { r = rspamd_snprintf (out_buf, sizeof (out_buf), "HTTP/1.0 404 No hashes have been written" CRLF CRLF); @@ -1037,6 +1053,19 @@ fuzzy_process_handler (struct controller_session *session, f_str_t * in) } rspamd_dispatcher_restore (session->dispatcher); } + else if (!processed) { + session->state = STATE_REPLY; + if (session->restful) { + r = rspamd_snprintf (out_buf, sizeof (out_buf), "HTTP/1.0 404 No fuzzy rules matched" CRLF CRLF); + } + else { + r = rspamd_snprintf (out_buf, sizeof (out_buf), "no fuzzy rules matched" CRLF "END" CRLF); + } + if (! rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE)) { + return; + } + rspamd_dispatcher_restore (session->dispatcher); + } } static void @@ -1155,10 +1184,3 @@ fuzzy_delete_handler (gchar **args, struct controller_session *session) { fuzzy_controller_handler (args, session, FUZZY_DEL); } - -static gint -fuzzy_mime_filter (struct worker_task *task) -{ - /* XXX: remove this */ - return 0; -} |