From 0dc48ea239965d05b760cb9d8e570e0d91aedb77 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Thu, 27 May 2010 18:59:02 +0400 Subject: [PATCH] * Convert statistic sums to use long double for counters * Use hyperbolic tangent for internal normalizer --- CMakeLists.txt | 1 - src/cfg_file.h | 2 +- src/cfg_utils.c | 19 +++------ src/classifiers/classifiers.h | 2 +- src/classifiers/winnow.c | 40 ++++++++---------- src/controller.c | 2 +- src/lua/lua_common.c | 4 +- src/lua/lua_common.h | 2 +- src/util.c | 78 +++++++++++++++++++++++++++++++---- 9 files changed, 99 insertions(+), 51 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e8896ac00..ef6e02b57 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,7 +18,6 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.6.0 FATAL_ERROR) OPTION(DEBUG_MODE "Enable debug output [default: ON]" ON) OPTION(ENABLE_OPTIMIZATION "Enable optimization [default: OFF]" OFF) -OPTION(ENABLE_PERL "Enable perl support [default: OFF]" OFF) OPTION(SKIP_RELINK_RPATH "Skip relinking and full RPATH for the install tree" OFF) OPTION(ENABLE_REDIRECTOR "Enable redirector install [default: OFF]" OFF) OPTION(ENABLE_PROFILING "Enable profiling [default: OFF]" OFF) diff --git a/src/cfg_file.h b/src/cfg_file.h index baf65c377..dec843359 100644 --- a/src/cfg_file.h +++ b/src/cfg_file.h @@ -161,7 +161,7 @@ struct statfile_binlog_params { uint16_t master_port; }; -typedef double (*statfile_normalize_func)(struct config_file *cfg, double score, void *params); +typedef double (*statfile_normalize_func)(struct config_file *cfg, long double score, void *params); /** * Statfile config definition diff --git a/src/cfg_utils.c b/src/cfg_utils.c index f72bf51a3..d63e6bc93 100644 --- a/src/cfg_utils.c +++ b/src/cfg_utils.c @@ -719,25 +719,16 @@ check_worker_conf (struct config_file *cfg, struct worker_conf *c) } static double -internal_normalizer_func (struct config_file *cfg, double score, void *data) +internal_normalizer_func (struct config_file *cfg, long double score, void *data) { - double max = *(double *)data; + long double max = *(double *)data; if (score < 0) { return score; } - else if (score > 0.001 && score < 1) { - return 1; - } - else if (score > 1 && score < max / 2.) { - return MIN(max, score * score); - } - else if (score < max) { - return score; - } - else if (score > max) { - return max; - } + else { + return max * tanhl (score / max); + } return score; } diff --git a/src/classifiers/classifiers.h b/src/classifiers/classifiers.h index de937bc3f..02192d795 100644 --- a/src/classifiers/classifiers.h +++ b/src/classifiers/classifiers.h @@ -17,7 +17,7 @@ struct classifier_ctx { struct classify_weight { const char *name; - double weight; + long double weight; }; /* Common classifier structure */ diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c index 637be759d..1d48cc2ba 100644 --- a/src/classifiers/winnow.c +++ b/src/classifiers/winnow.c @@ -51,7 +51,7 @@ struct winnow_callback_data { struct classifier_ctx *ctx; stat_file_t *file; stat_file_t *learn_file; - double sum; + long double sum; double multiplier; int count; gboolean in_class; @@ -71,12 +71,7 @@ classify_callback (gpointer key, gpointer value, gpointer data) /* Consider that not found blocks have value 1 */ v = statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, cd->now); if (fabs (v) > ALPHA) { - if (cd->sum + v > MAX_WEIGHT) { - cd->sum = MAX_WEIGHT; - } - else { - cd->sum += v; - } + cd->sum += v; cd->in_class++; } @@ -160,12 +155,7 @@ learn_callback (gpointer key, gpointer value, gpointer data) } - if (cd->sum + node->value > MAX_WEIGHT) { - cd->sum = MAX_WEIGHT; - } - else { - cd->sum += node->value; - } + cd->sum += node->value; cd->count++; @@ -188,7 +178,7 @@ winnow_classify (struct classifier_ctx *ctx, statfile_pool_t * pool, GTree * inp { struct winnow_callback_data data; char *sumbuf, *value; - double res = 0., max = 0.; + long double res = 0., max = 0.; GList *cur; struct statfile *st, *sel = NULL; int nodes, minnodes; @@ -258,7 +248,7 @@ winnow_classify (struct classifier_ctx *ctx, statfile_pool_t * pool, GTree * inp if (sel != NULL) { sumbuf = memory_pool_alloc (task->task_pool, 32); - snprintf (sumbuf, 32, "%.2f", max); + snprintf (sumbuf, 32, "%.2Lg", max); cur = g_list_prepend (NULL, sumbuf); #ifdef WITH_LUA max = call_classifier_post_callbacks (ctx->cfg, task, max); @@ -271,7 +261,7 @@ GList * winnow_weights (struct classifier_ctx *ctx, statfile_pool_t * pool, GTree * input, struct worker_task *task) { struct winnow_callback_data data; - double res = 0.; + long double res = 0.; GList *cur, *resl = NULL; struct statfile *st; struct classify_weight *w; @@ -346,7 +336,7 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, stat_file_t *fi int nodes, minnodes, iterations = 0; struct statfile *st; stat_file_t *sel; - double res = 0., max = 0.; + long double res = 0., max = 0.; GList *cur; g_assert (pool != NULL); @@ -407,12 +397,16 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, stat_file_t *fi } } while ((in_class ? sel != file : sel == file) && iterations ++ < MAX_LEARN_ITERATIONS); + if (iterations >= MAX_LEARN_ITERATIONS) { + msg_warn ("learning statfile %s was not fully successfull: iterations count is limited to %d, final sum is %G", + file->filename, MAX_LEARN_ITERATIONS, max); + } + else { + msg_info ("learned statfile %s successfully with %d iterations and sum %G", file->filename, iterations, max); + } + + if (sum) { - if (data.count != 0) { - *sum = data.sum / data.count; - } - else { - *sum = 0; - } + *sum = max; } } diff --git a/src/controller.c b/src/controller.c index d3c5e9e70..236c3eca9 100644 --- a/src/controller.c +++ b/src/controller.c @@ -849,7 +849,7 @@ controller_read_socket (f_str_t * in, void *arg) while (cur) { w = cur->data; - i += snprintf (out_buf + i, sizeof (out_buf) - i, "%s: %.2f" CRLF, w->name, w->weight); + i += snprintf (out_buf + i, sizeof (out_buf) - i, "%s: %.2Lg" CRLF, w->name, w->weight); cur = g_list_next (cur); } if (!rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE)) { diff --git a/src/lua/lua_common.c b/src/lua/lua_common.c index fc5fe0772..d1a2b614b 100644 --- a/src/lua/lua_common.c +++ b/src/lua/lua_common.c @@ -415,10 +415,10 @@ lua_consolidation_func (struct worker_task *task, const char *metric_name, const } double -lua_normalizer_func (struct config_file *cfg, double score, void *params) +lua_normalizer_func (struct config_file *cfg, long double score, void *params) { GList *p = params; - double res = score; + long double res = score; lua_State *L = cfg->lua_state; /* Call specified function and put input score on stack */ diff --git a/src/lua/lua_common.h b/src/lua/lua_common.h index f89ccaa30..ffed03e58 100644 --- a/src/lua/lua_common.h +++ b/src/lua/lua_common.h @@ -44,7 +44,7 @@ void add_luabuf (const char *line); GList *call_classifier_pre_callbacks (struct classifier_config *ccf, struct worker_task *task); double call_classifier_post_callbacks (struct classifier_config *ccf, struct worker_task *task, double in); -double lua_normalizer_func (struct config_file *cfg, double score, void *params); +double lua_normalizer_func (struct config_file *cfg, long double score, void *params); /* Config file functions */ void lua_post_load_config (struct config_file *cfg); diff --git a/src/util.c b/src/util.c index c093ccc36..bf0a491f3 100644 --- a/src/util.c +++ b/src/util.c @@ -1034,7 +1034,10 @@ get_statfile_by_symbol (statfile_pool_t *pool, struct classifier_config *ccf, * %[0][width|m][u][x|X]i int/ngx_int_t * %[0][width][u][x|X]D int32_t/uint32_t * %[0][width][u][x|X]L int64_t/uint64_t - * %[0][width][.width]f float + * %[0][width][.width]f double + * %[0][width][.width]F long double + * %[0][width][.width]g double + * %[0][width][.width]G long double * %P pid_t * %r rlim_t * %p void * @@ -1082,7 +1085,7 @@ rspamd_vsnprintf (u_char *buf, size_t max, const char *fmt, va_list args) { u_char *p, zero, *last; int d; - float f, scale; + long double f, scale; size_t len, slen; int64_t i64; uint64_t ui64; @@ -1144,7 +1147,6 @@ rspamd_vsnprintf (u_char *buf, size_t max, const char *fmt, va_list args) sign = 0; fmt++; continue; - case '.': fmt++; @@ -1258,7 +1260,43 @@ rspamd_vsnprintf (u_char *buf, size_t max, const char *fmt, va_list args) case 'f': - f = (float) va_arg (args, double); + f = (double) va_arg (args, double); + if (f < 0) { + *buf++ = '-'; + f = -f; + } + + ui64 = (int64_t) f; + + buf = rspamd_sprintf_num (buf, last, ui64, zero, 0, width); + + if (frac_width) { + + if (buf < last) { + *buf++ = '.'; + } + + scale = 1.0; + + for (i = 0; i < frac_width; i++) { + scale *= 10.0; + } + + /* + * (int64_t) cast is required for msvc6: + * it can not convert uint64_t to double + */ + ui64 = (uint64_t) ((f - (int64_t) ui64) * scale); + + buf = rspamd_sprintf_num (buf, last, ui64, '0', 0, frac_width); + } + + fmt++; + + continue; + + case 'F': + f = (long double) va_arg (args, long double); if (f < 0) { *buf++ = '-'; @@ -1282,9 +1320,9 @@ rspamd_vsnprintf (u_char *buf, size_t max, const char *fmt, va_list args) } /* - * (int64_t) cast is required for msvc6: - * it can not convert uint64_t to double - */ + * (int64_t) cast is required for msvc6: + * it can not convert uint64_t to double + */ ui64 = (uint64_t) ((f - (int64_t) ui64) * scale); buf = rspamd_sprintf_num (buf, last, ui64, '0', 0, frac_width); @@ -1294,6 +1332,32 @@ rspamd_vsnprintf (u_char *buf, size_t max, const char *fmt, va_list args) continue; + case 'g': + f = (long double) va_arg (args, double); + + if (f < 0) { + *buf++ = '-'; + f = -f; + } + g_ascii_formatd (buf, last - buf, "%g", (double)f); + buf += strlen (buf); + fmt++; + + continue; + + case 'G': + f = (long double) va_arg (args, long double); + + if (f < 0) { + *buf++ = '-'; + f = -f; + } + g_ascii_formatd (buf, last - buf, "%g", (double)f); + buf += strlen (buf); + fmt++; + + continue; + case 'p': ui64 = (uintptr_t) va_arg (args, void *); hex = 2; -- 2.39.5