* One more try to improve accuracy of winnow algorithm

author: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2010-08-06 20:48:11 +0400
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2010-08-06 20:48:11 +0400
commit: 9aa989ea76ee78107ed7ae02a3d1b8e297f57b4c (patch)
tree: c925bd5a8df5394f583ba79ae69794338dd0821a
parent: 5d0e4d334fef7f0fe683040d32e2a53b503315f2 (diff)
download: rspamd-9aa989ea76ee78107ed7ae02a3d1b8e297f57b4c.tar.gz
rspamd-9aa989ea76ee78107ed7ae02a3d1b8e297f57b4c.zip
2 files changed, 29 insertions, 13 deletions
diff --git a/src/classifiers/winnow.c b/src/classifiers/winnow.c
index ab155ff8c..704f65b0a 100644
--- a/src/classifiers/winnow.c
+++ b/src/classifiers/winnow.c
@@ -58,6 +58,7 @@ struct winnow_callback_data {
 	stat_file_t                    *file;
 	stat_file_t                    *learn_file;
 	long double                     sum;
+	long double 					start;
 	double                          multiplier;
 	guint32                         count;
 	guint32                         new_blocks;
@@ -106,6 +107,7 @@ learn_callback (gpointer key, gpointer value, gpointer data)
 	v = statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, cd->now);
 	if (fabs (v) < ALPHA) {
 		/* Block not found, insert new */
+		cd->start += 1;
 		if (cd->file == cd->learn_file) {
 			statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, c);
 			node->value = c;
@@ -113,6 +115,7 @@ learn_callback (gpointer key, gpointer value, gpointer data)
 		}
 	}
 	else {
+		cd->start += v;
 		/* Here we just increase the extra value of block */
 		if (cd->fresh_run) {
 			node->extra = 0;
@@ -354,7 +357,7 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, const char *sym
 	int                             nodes, minnodes, iterations = 0;
 	struct statfile                *st, *sel_st;
 	stat_file_t                    *sel = NULL, *to_learn;
-	long double                     res = 0., max = 0.;
+	long double                     res = 0., max = 0., start_value, end_value;
 	double                          learn_threshold = 0.0;
 	GList                          *cur, *to_demote = NULL;
 	gboolean                        force_learn = FALSE;
@@ -443,7 +446,7 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, const char *sym
 			max = 0;
 		}
 		/* If most of blocks are not presented in targeted statfile do forced learn */
-		if ((data.new_blocks > 1 && (double)data.new_blocks / (double)data.count > 0.5) || max < 1 + learn_threshold) {
+		if (max < 1 + learn_threshold) {
 			force_learn = TRUE;
 		}
 		/* Check other statfiles */
@@ -483,12 +486,13 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, const char *sym
 		return FALSE;
 	}
 	/* If to_demote list is empty this message is already classified correctly */
-	if (max > ALPHA && to_demote == NULL && !force_learn) {
+	if (max > WINNOW_PROMOTION && to_demote == NULL && !force_learn) {
 		msg_info ("this message is already of class %s with threshold %.2f and weight %.2F",
 				sel_st->symbol, learn_threshold, max);
 		goto end;
 	}
 	data.learn_file = to_learn;
+	end_value = max;
 	do {
 		cur = ctx->cfg->statfiles;
 		data.fresh_run = TRUE;
@@ -497,6 +501,7 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, const char *sym
 			data.sum = 0;
 			data.count = 0;
 			data.new_blocks = 0;
+			data.start = 0;
 			if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
 				return FALSE;
 			}
@@ -506,6 +511,7 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, const char *sym
 			else {
 				data.do_demote = FALSE;
 			}
+
 			statfile_pool_lock_file (pool, data.file);
 			g_tree_foreach (input, learn_callback, &data);
 			statfile_pool_unlock_file (pool, data.file);
@@ -519,16 +525,19 @@ winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, const char *sym
 				max = res;
 				sel = data.file;
 			}
+			if (data.file == to_learn) {
+				if (data.count > 0) {
+					start_value = data.start / data.count;
+				}
+				end_value = res;
+			}
 			cur = g_list_next (cur);
 			data.fresh_run = FALSE;
 		}
-		
-		if (data.multiplier > 1) {
-			data.multiplier *= data.multiplier;
-		}
-		else {
-			data.multiplier *= WINNOW_PROMOTION;
-		}
+
+		data.multiplier *= WINNOW_PROMOTION;
+		msg_info ("learn iteration %d for statfile %s: %G -> %G, multiplier: %.2f", iterations + 1, symbol,
+				start_value, end_value, data.multiplier);
 	} while ((in_class ? sel != to_learn : sel == to_learn)  && iterations ++ < MAX_LEARN_ITERATIONS);
 	
 	if (iterations >= MAX_LEARN_ITERATIONS) {
diff --git a/src/controller.c b/src/controller.c
index 15b14cf8d..a79159c69 100644
--- a/src/controller.c
+++ b/src/controller.c
@@ -926,11 +926,18 @@ controller_read_socket (f_str_t * in, void *arg)
 
 		while (cur) {
 			w = cur->data;
-			i += rspamd_snprintf (out_buf + i, sizeof (out_buf) - i, "%s: %.2Lg" CRLF, w->name, w->weight);
+			i += rspamd_snprintf (out_buf + i, sizeof (out_buf) - i, "%s: %G" CRLF, w->name, w->weight);
 			cur = g_list_next (cur);
 		}
-		if (!rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE)) {
-			return FALSE;
+		if (i != 0) {
+			if (!rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE)) {
+				return FALSE;
+			}
+		}
+		else {
+			if (!rspamd_dispatcher_write (session->dispatcher, "weights failed: classifier error", 0, FALSE, TRUE)) {
+				return FALSE;
+			}
 		}
 
 		free_task (task, FALSE);
author	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2010-08-06 20:48:11 +0400
committer	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2010-08-06 20:48:11 +0400
commit	9aa989ea76ee78107ed7ae02a3d1b8e297f57b4c (patch)
tree	c925bd5a8df5394f583ba79ae69794338dd0821a
parent	5d0e4d334fef7f0fe683040d32e2a53b503315f2 (diff)
download	rspamd-9aa989ea76ee78107ed7ae02a3d1b8e297f57b4c.tar.gz rspamd-9aa989ea76ee78107ed7ae02a3d1b8e297f57b4c.zip