* Add max_tokens options to avoid classifying and learning with too much tokens from one message.

Fix stupid memory leakage on client's timeout.
author: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2011-07-26 16:57:36 +0400
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2011-07-26 16:57:36 +0400
commit: 6eded20b2c15e524ed9a83c436a3b5f0bfbd253c (patch)
tree: b5a5d7bf336a7eaf965d72554c8b51028f9eb0d1 /src/classifiers
parent: 8e4282bb260132232c26f3f87f48181044ea6cc3 (diff)
download: rspamd-6eded20b2c15e524ed9a83c436a3b5f0bfbd253c.tar.gz
rspamd-6eded20b2c15e524ed9a83c436a3b5f0bfbd253c.zip
1 files changed, 47 insertions, 7 deletions
diff --git a/src/classifiers/bayes.c b/src/classifiers/bayes.c
index 5d00505be..c006228b4 100644
--- a/src/classifiers/bayes.c
+++ b/src/classifiers/bayes.c
@@ -60,7 +60,8 @@ struct bayes_callback_data {
 	stat_file_t                    *file;
 	struct bayes_statfile_data     *statfiles;
 	guint32                         statfiles_num;
-	guint64                          learned_tokens;
+	guint64                         learned_tokens;
+	gsize                           max_tokens;
 };
 
 static                          gboolean
@@ -92,6 +93,10 @@ bayes_learn_callback (gpointer key, gpointer value, gpointer data)
 		cd->learned_tokens ++;
 	}
 
+	if (cd->max_tokens != 0 && cd->learned_tokens > cd->max_tokens) {
+		/* Stop learning on max tokens */
+		return TRUE;
+	}
 	return FALSE;
 }
 
@@ -151,6 +156,12 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data)
 		}
 	}
 
+	cd->learned_tokens ++;
+	if (cd->max_tokens != 0 && cd->learned_tokens > cd->max_tokens) {
+		/* Stop classifying on max tokens */
+		return TRUE;
+	}
+
 	return FALSE;
 }
 
@@ -171,7 +182,8 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input,
 {
 	struct bayes_callback_data      data;
 	gchar                          *value;
-	gint                            nodes, minnodes, i = 0, cnt, best_num = 0;
+	gint                            nodes, i = 0, cnt, best_num = 0;
+	gsize                           minnodes;
 	guint64                         rev, total_learns = 0;
 	double                          best = 0;
 	struct statfile                *st;
@@ -207,6 +219,15 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input,
 	data.now = time (NULL);
 	data.ctx = ctx;
 
+	data.learned_tokens = 0;
+	if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
+		minnodes = parse_limit (value);
+		data.max_tokens = minnodes;
+	}
+	else {
+		data.max_tokens = 0;
+	}
+
 	while (cur) {
 		/* Select statfile to classify */
 		st = cur->data;
@@ -264,8 +285,9 @@ bayes_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symb
 				gboolean in_class, double *sum, double multiplier, GError **err)
 {
 	struct bayes_callback_data      data;
-	char                           *value;
-	int                             nodes, minnodes;
+	gchar                          *value;
+	gint                            nodes;
+	gsize                           minnodes;
 	struct statfile                *st, *sel_st = NULL;
 	stat_file_t                    *to_learn;
 	GList                          *cur;
@@ -286,7 +308,7 @@ bayes_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symb
 	                   bayes_error_quark(),		/* error domain */
 	                   1,            				/* error code */
 	                   "message contains too few tokens: %d, while min is %d",
-	                   nodes, minnodes);
+	                   nodes, (int)minnodes);
 			return FALSE;
 		}
 	}
@@ -296,6 +318,14 @@ bayes_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symb
 	data.now = time (NULL);
 	data.ctx = ctx;
 	data.learned_tokens = 0;
+	data.learned_tokens = 0;
+	if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
+		minnodes = parse_limit (value);
+		data.max_tokens = minnodes;
+	}
+	else {
+		data.max_tokens = 0;
+	}
 	cur = ctx->cfg->statfiles;
 	while (cur) {
 		/* Select statfile to learn */
@@ -356,7 +386,8 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
 {
 	struct bayes_callback_data      data;
 	gchar                          *value;
-	gint                            nodes, minnodes;
+	gint                            nodes;
+	gsize                           minnodes;
 	struct statfile                *st;
 	stat_file_t                    *file;
 	GList                          *cur;
@@ -375,7 +406,7 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
 					bayes_error_quark(),		/* error domain */
 					1,            				/* error code */
 					"message contains too few tokens: %d, while min is %d",
-					nodes, minnodes);
+					nodes, (int)minnodes);
 			return FALSE;
 		}
 	}
@@ -392,6 +423,15 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
 	data.now = time (NULL);
 	data.ctx = ctx;
 
+	data.learned_tokens = 0;
+	if (ctx->cfg->opts && (value = g_hash_table_lookup (ctx->cfg->opts, "max_tokens")) != NULL) {
+		minnodes = parse_limit (value);
+		data.max_tokens = minnodes;
+	}
+	else {
+		data.max_tokens = 0;
+	}
+
 	while (cur) {
 		/* Select statfiles to learn */
 		st = cur->data;
author	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2011-07-26 16:57:36 +0400
committer	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2011-07-26 16:57:36 +0400
commit	6eded20b2c15e524ed9a83c436a3b5f0bfbd253c (patch)
tree	b5a5d7bf336a7eaf965d72554c8b51028f9eb0d1 /src/classifiers
parent	8e4282bb260132232c26f3f87f48181044ea6cc3 (diff)
download	rspamd-6eded20b2c15e524ed9a83c436a3b5f0bfbd253c.tar.gz rspamd-6eded20b2c15e524ed9a83c436a3b5f0bfbd253c.zip