* Add meta-classification example.

Many changes to advanced statistics and meta-classification logic. Add an example of complex meta-classification.
author: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2012-10-04 22:14:10 +0400
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2012-10-04 22:14:10 +0400
commit: 3789849b7b2e617d0a287fe77490b6643f3a6b74 (patch)
tree: 8415305aee39a3aad6adbccbc1941a62f3a41cf8
parent: 14e1129068d55bc8de0618832d4f7d33bb1b0f06 (diff)
download: rspamd-3789849b7b2e617d0a287fe77490b6643f3a6b74.tar.gz
rspamd-3789849b7b2e617d0a287fe77490b6643f3a6b74.zip
7 files changed, 276 insertions, 37 deletions
diff --git a/conf/lua/rspamd.classifiers.lua b/conf/lua/rspamd.classifiers.lua
index e158a29a6..de1e23506 100644
--- a/conf/lua/rspamd.classifiers.lua
+++ b/conf/lua/rspamd.classifiers.lua
@@ -1,5 +1,76 @@
 -- Detect language of message and selects appropriate statfiles for it
 
+-- Common labels for specific statfiles
+local many_recipients_label = 'many recipients'
+local undisclosed_recipients_label = 'undisclosed recipients'
+local list_label = 'maillist'
+local long_subject_label = 'long subject'
+local different_reply_to_label = 'different reply to'
+local has_in_reply_label = 'reply message'
+
+-- Collect the label-specific statfiles whose rule matches the message.
+-- A rule is checked only when get_statfile_by_label() returns statfiles for its label.
+-- Returns the list of matched statfiles, or nil if fewer than two matched.
+local function get_specific_statfiles(classifier, task)
+	local spec_st = {}
+	-- More than 5 recipients
+	local st_many = classifier:get_statfile_by_label(many_recipients_label)
+	if st_many then
+		local rcpt = task:get_recipients()
+		if rcpt and table.maxn(rcpt) > 5 then
+			for _,v in pairs(st_many) do table.insert(spec_st, v) end
+		end
+	end
+	-- Undisclosed
+	local st_undisc = classifier:get_statfile_by_label(undisclosed_recipients_label)
+	if st_undisc then
+		local rcpt = task:get_recipients()
+		if rcpt and table.maxn(rcpt) == 0 then
+			for _,v in pairs(st_undisc) do table.insert(spec_st, v) end
+		end
+	end
+	-- Maillist
+	local st_maillist = classifier:get_statfile_by_label(list_label)
+	if st_maillist then
+		local unsub_header = task:get_raw_header('List-Unsubscribe')
+		if unsub_header and unsub_header[1] then
+			for _,v in pairs(st_maillist) do table.insert(spec_st, v) end
+		end
+	end
+	-- Long subject
+	local st_longsubj = classifier:get_statfile_by_label(long_subject_label)
+	if st_longsubj then
+		local subj = task:get_raw_header('Subject')
+		if subj and subj[1] and string.len(subj[1]['value']) > 150 then
+			for _,v in pairs(st_longsubj) do table.insert(spec_st, v) end
+		end
+	end
+	-- Reply-To != To
+	local st_replyto = classifier:get_statfile_by_label(different_reply_to_label)
+	if st_replyto then
+		local to = task:get_raw_header('To')
+		local reply_to = task:get_raw_header('Reply-To')
+		if to and to[1] and reply_to and reply_to[1] then
+			if string.lower(to[1]['value']) ~= string.lower(reply_to[1]['value']) then
+				for _,v in pairs(st_replyto) do table.insert(spec_st, v) end
+			end
+		end
+	end
+	-- Has In-Reply-To header
+	local st_reply = classifier:get_statfile_by_label(has_in_reply_label)
+	if st_reply then
+		local inrep_header = task:get_raw_header('In-Reply-To')
+		if inrep_header and inrep_header[1] then
+			for _,v in pairs(st_reply) do table.insert(spec_st, v) end
+		end
+	end
+	-- Use the specific set only if at least two statfiles matched
+	if table.maxn(spec_st) > 1 then
+		return spec_st
+	end
+	return nil
+end
+
 classifiers['bayes'] = function(classifier, task, is_learn, is_spam)
 	-- Subfunction for detection of message's language
 	local detect_language = function(task)
@@ -14,35 +85,50 @@ classifiers['bayes'] = function(classifier, task, is_learn, is_spam)
 	end
 
 	-- Main procedure
+	local selected = {}
+	local spec_st = get_specific_statfiles(classifier, task)
+	if spec_st then
+		if is_learn then
+			return spec_st
+		else
+			-- Merge tables
+			table.foreach(spec_st, function(i,v) table.insert(selected,v) end)
+		end
+	end
+	-- Detect statfile by language
 	language = detect_language(task)
 	if language then
 		-- Find statfiles with specified language
-		local selected = {}
-		for _,st in pairs(classifier:get_statfiles()) do
-			local st_l = st:get_param('language')
-			if st_l and st_l == language then
-			    -- Insert statfile with specified language    
-			    table.insert(selected, st)
+		for _,st in ipairs(classifier:get_statfiles()) do
+			-- Skip labeled statfiles
+			if not st:get_label() then
+				local st_l = st:get_param('language')
+				if st_l and st_l == language then
+					-- Insert statfile with specified language    
+					table.insert(selected, st)
+				end
 			end
 		end
 		if table.maxn(selected) > 1 then
 			return selected
 		end
-	else
-		-- Language not detected
-		local selected = {}
-		for _,st in ipairs(classifier:get_statfiles()) do
+	end
+
+	-- Language not detected or specific language statfiles have not been found
+	for _,st in ipairs(classifier:get_statfiles()) do
+		-- Skip labeled statfiles
+		if not st:get_label() then
 			local st_l = st:get_param('language')
 			-- Insert only statfiles without language
 			if not st_l then
 				table.insert(selected, st)
 			end
 		end
-		if table.maxn(selected) > 1 then
-			return selected
-		end
 	end
-
+	if table.maxn(selected) > 1 then
+		return selected
+	end
+	
 	return nil
 end
 
diff --git a/conf/rspamd-basic.xml.in b/conf/rspamd-basic.xml.in
index 06bd80c03..fbeee898a 100644
--- a/conf/rspamd-basic.xml.in
+++ b/conf/rspamd-basic.xml.in
@@ -603,6 +603,132 @@
  </statfile>
 </classifier>
 
+<!-- Advanced meta-classification statistics -->
+<!--
+<classifier type="bayes">
+ <tokenizer>osb-text</tokenizer>
+ <metric>default</metric>
+ <min_tokens>6</min_tokens>
+ <max_tokens>5000</max_tokens>
+ <statfile>
+  <symbol>BAYES_HAM</symbol>
+  <size>50M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes.ham</path>
+  <spam>no</spam>
+ </statfile>
+ <statfile>
+  <symbol>BAYES_SPAM</symbol>
+  <size>50M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes.spam</path>
+  <spam>yes</spam>
+ </statfile>
+ <statfile>
+  <symbol>BAYES_HAM_RU</symbol>
+  <size>50M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_ru.ham</path>
+  <language>ru</language>
+  <spam>no</spam>
+ </statfile>
+ <statfile>
+  <symbol>BAYES_SPAM_RU</symbol>
+  <size>50M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_ru.spam</path>
+  <language>ru</language>
+  <spam>yes</spam>
+ </statfile>
+
+ <statfile>
+  <symbol>BAYES_SPAM</symbol>
+  <size>10M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_manyrcpt.spam</path>
+  <spam>yes</spam>
+  <label>many recipients</label>
+ </statfile>
+ <statfile>
+  <symbol>BAYES_HAM</symbol>
+  <size>10M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_manyrcpt.ham</path>
+  <spam>no</spam>
+  <label>many recipients</label>
+ </statfile>
+
+ <statfile>
+  <symbol>BAYES_SPAM</symbol>
+  <size>10M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_undisclosedrcpt.spam</path>
+  <spam>yes</spam>
+  <label>undisclosed recipients</label>
+ </statfile>
+ <statfile>
+  <symbol>BAYES_HAM</symbol>
+  <size>10M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_undisclosedrcpt.ham</path>
+  <spam>no</spam>
+  <label>undisclosed recipients</label>
+ </statfile>
+
+ <statfile>
+  <symbol>BAYES_SPAM</symbol>
+  <size>10M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_maillist.spam</path>
+  <spam>yes</spam>
+  <label>maillist</label>
+ </statfile>
+ <statfile>
+  <symbol>BAYES_HAM</symbol>
+  <size>10M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_maillist.ham</path>
+  <spam>no</spam>
+  <label>maillist</label>
+ </statfile>
+
+ <statfile>
+  <symbol>BAYES_SPAM</symbol>
+  <size>10M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_longsubject.spam</path>
+  <spam>yes</spam>
+  <label>long subject</label>
+ </statfile>
+ <statfile>
+  <symbol>BAYES_HAM</symbol>
+  <size>10M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_longsubject.ham</path>
+  <spam>no</spam>
+  <label>long subject</label>
+ </statfile>
+
+ <statfile>
+  <symbol>BAYES_SPAM</symbol>
+  <size>10M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_replyto.spam</path>
+  <spam>yes</spam>
+  <label>different reply to</label>
+ </statfile>
+ <statfile>
+  <symbol>BAYES_HAM</symbol>
+  <size>10M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_replyto.ham</path>
+  <spam>no</spam>
+  <label>different reply to</label>
+ </statfile>
+
+ <statfile>
+  <symbol>BAYES_SPAM</symbol>
+  <size>10M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_replymessage.spam</path>
+  <spam>yes</spam>
+  <label>reply message</label>
+ </statfile>
+ <statfile>
+  <symbol>BAYES_HAM</symbol>
+  <size>10M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_replymessage.ham</path>
+  <spam>no</spam>
+  <label>reply message</label>
+ </statfile>
+</classifier>
+-->
+
 <!-- End of classifiers section -->
 
 <!-- Modules section -->
diff --git a/lib/client/librspamdclient.c b/lib/client/librspamdclient.c
index 7a073f410..0b6cd05d3 100644
--- a/lib/client/librspamdclient.c
+++ b/lib/client/librspamdclient.c
@@ -1175,9 +1175,19 @@ rspamd_add_server (struct rspamd_client *client, const gchar *host, guint16 port
 {
 	struct rspamd_server           *new;
 	struct hostent                 *hent;
-	gint							nlen;
+	gint							nlen, i;
 
 	g_assert (client != NULL);
+
+	/* Avoid duplicates */
+	for (i = 0; i < (gint)client->servers_num; i ++) {
+		new = &client->servers[i];
+		if (new->client_port == port && new->controller_port == controller_port && strcmp (host, new->host) == 0) {
+			/* Duplicate */
+			return TRUE;
+		}
+	}
+
 	if (client->servers_num >= MAX_RSPAMD_SERVERS) {
 		if (*err == NULL) {
 			*err = g_error_new (G_RSPAMD_ERROR, 1, "Maximum number of servers reached: %d", MAX_RSPAMD_SERVERS);
diff --git a/src/classifiers/bayes.c b/src/classifiers/bayes.c
index cad963c4b..a80bbe0ba 100644
--- a/src/classifiers/bayes.c
+++ b/src/classifiers/bayes.c
@@ -391,6 +391,7 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
 	struct statfile                *st;
 	stat_file_t                    *file;
 	GList                          *cur;
+	gboolean						skip_labels;
 
 	g_assert (pool != NULL);
 	g_assert (ctx != NULL);
@@ -411,11 +412,14 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
 		}
 	}
 
-	cur = call_classifier_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L);
+	cur = call_classifier_pre_callbacks (ctx->cfg, task, TRUE, is_spam, L);
 	if (cur) {
+		skip_labels = FALSE;
 		memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_list_free, cur);
 	}
 	else {
+		/* Do not try to learn specific statfiles if pre callback returned nil */
+		skip_labels = TRUE;
 		cur = ctx->cfg->statfiles;
 	}
 
@@ -435,7 +439,7 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
 	while (cur) {
 		/* Select statfiles to learn */
 		st = cur->data;
-		if (st->is_spam != is_spam) {
+		if (st->is_spam != is_spam || (skip_labels && st->label)) {
 			cur = g_list_next (cur);
 			continue;
 		}
@@ -460,8 +464,6 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
 					msg_err ("cannot open statfile %s after creation", st->path);
 					return FALSE;
 				}
-				cur = g_list_next (cur);
-				continue;
 			}
 		}
 		data.file = file;
@@ -470,6 +472,7 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
 		statfile_inc_revision (file);
 		statfile_pool_unlock_file (pool, data.file);
 		maybe_write_binlog (ctx->cfg, st, file, input);
+		msg_info ("increase revision for %s", st->path);
 
 		cur = g_list_next (cur);
 	}
diff --git a/src/controller.c b/src/controller.c
index c987bc15f..47d444317 100644
--- a/src/controller.c
+++ b/src/controller.c
@@ -488,11 +488,20 @@ process_stat_command (struct controller_session *session)
 				total = statfile_get_total_blocks (statfile);
 				statfile_get_revision (statfile, &rev, &ti);
 				if (total != (guint64)-1 && used != (guint64)-1) {
-					r += rspamd_snprintf (out_buf + r, sizeof (out_buf) - r, 
-							"Statfile: %s (version %uL); length: %Hz; free blocks: %uL; total blocks: %uL; free: %.2f%%" CRLF,
-							st->symbol, rev, st->size,
-							(total - used), total,
-							(double)((double)(total - used) / (double)total) * 100.);
+					if (st->label) {
+						r += rspamd_snprintf (out_buf + r, sizeof (out_buf) - r,
+								"Statfile: %s <%s> (version %uL); length: %Hz; free blocks: %uL; total blocks: %uL; free: %.2f%%" CRLF,
+								st->symbol, st->label, rev, st->size,
+								(total - used), total,
+								(double)((double)(total - used) / (double)total) * 100.);
+					}
+					else {
+						r += rspamd_snprintf (out_buf + r, sizeof (out_buf) - r,
+								"Statfile: %s (version %uL); length: %Hz; free blocks: %uL; total blocks: %uL; free: %.2f%%" CRLF,
+								st->symbol, rev, st->size,
+								(total - used), total,
+								(double)((double)(total - used) / (double)total) * 100.);
+					}
 				}
 			}
 			cur_st = g_list_next (cur_st);
@@ -1173,8 +1182,6 @@ fin_learn_task (void *arg)
 
 	if (task->state != WRITING_REPLY) {
 		task->state = WRITE_REPLY;
-		/* Process all statfiles */
-		process_statfiles (task);
 	}
 
 	/* Check if we have all events finished */
diff --git a/src/lua/lua_classifier.c b/src/lua/lua_classifier.c
index be18cda0d..202d29af3 100644
--- a/src/lua/lua_classifier.c
+++ b/src/lua/lua_classifier.c
@@ -247,19 +247,18 @@ lua_classifier_get_statfiles (lua_State *L)
 	struct classifier_config       *ccf = lua_check_classifier (L);
 	GList                          *cur;
 	struct statfile                *st, **pst;
+	gint							i;
 
 	if (ccf) {
 		lua_newtable (L);
 		cur = g_list_first (ccf->statfiles);
+		i = 1;
 		while (cur) {
 			st = cur->data;
-			/* t['statfile_name'] = statfile */
-			lua_pushstring (L, st->symbol);
 			pst = lua_newuserdata (L, sizeof (struct statfile *));
 			lua_setclass (L, "rspamd{statfile}", -1);
 			*pst = st;
-
-			lua_settable (L, -3);
+			lua_rawseti (L, -2, i++);
 
 			cur = g_list_next (cur);
 		}
@@ -388,7 +387,7 @@ lua_statfile_get_param (lua_State *L)
 
 	if (st != NULL && param != NULL) {
 		value = g_hash_table_lookup (st->opts, param);
-		if (param != NULL) {
+		if (value != NULL) {
 			lua_pushstring (L, value);
 			return 1;
 		}
diff --git a/src/statfile.c b/src/statfile.c
index 3c4674fc9..15c41550a 100644
--- a/src/statfile.c
+++ b/src/statfile.c
@@ -415,7 +415,7 @@ statfile_pool_close (statfile_pool_t * pool, stat_file_t * file, gboolean keep_s
 
 	if (file->map) {
 		msg_info ("syncing statfile %s", file->filename);
-		msync (file->map, file->len, MS_INVALIDATE | MS_SYNC);
+		msync (file->map, file->len, MS_ASYNC);
 		munmap (file->map, file->len);
 	}
 	if (file->fd != -1) {
@@ -606,7 +606,7 @@ statfile_pool_set_block_common (statfile_pool_t * pool, stat_file_t * file, guin
 	for (i = 0; i < CHAIN_LENGTH; i++) {
 		if (i + blocknum >= file->cur_section.length) {
 			/* Need to expire some block in chain */
-			msg_debug ("chain %u is full, starting expire", blocknum);
+			msg_info ("chain %ud is full in statfile %s, starting expire", blocknum, file->filename);
 			break;
 		}
 		/* First try to find block in chain */
@@ -617,7 +617,7 @@ statfile_pool_set_block_common (statfile_pool_t * pool, stat_file_t * file, guin
 		/* Check whether we have a free block in chain */
 		if (block->hash1 == 0 && block->hash2 == 0) {
 			/* Write new block here */
-			msg_debug ("found free block %u in chain %u, set h1=%u, h2=%u", i, blocknum, h1, h2);
+			msg_debug ("found free block %ud in chain %ud, set h1=%ud, h2=%ud", i, blocknum, h1, h2);
 			block->hash1 = h1;
 			block->hash2 = h2;
 			block->value = value;
@@ -880,12 +880,20 @@ statfile_pool_invalidate_callback (gint fd, short what, void *ud)
 void
 statfile_pool_plan_invalidate (statfile_pool_t *pool, time_t seconds, time_t jitter)
 {
+	gboolean                        pending;
 
-	if (pool->invalidate_event == NULL || ! evtimer_pending (pool->invalidate_event, NULL)) {
 
-		if (pool->invalidate_event == NULL) {
-			pool->invalidate_event = memory_pool_alloc (pool->pool, sizeof (struct event));
+	if (pool->invalidate_event != NULL) {
+		pending = evtimer_pending (pool->invalidate_event, NULL);
+		if (pending) {
+			/* Replan event */
+			pool->invalidate_tv.tv_sec = seconds + g_random_int_range (0, jitter);
+			pool->invalidate_tv.tv_usec = 0;
+			evtimer_add (pool->invalidate_event, &pool->invalidate_tv);
 		}
+	}
+	else {
+		pool->invalidate_event = memory_pool_alloc (pool->pool, sizeof (struct event));
 		pool->invalidate_tv.tv_sec = seconds + g_random_int_range (0, jitter);
 		pool->invalidate_tv.tv_usec = 0;
 		evtimer_set (pool->invalidate_event, statfile_pool_invalidate_callback, pool);
author	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2012-10-04 22:14:10 +0400
committer	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2012-10-04 22:14:10 +0400
commit	3789849b7b2e617d0a287fe77490b6643f3a6b74 (patch)
tree	8415305aee39a3aad6adbccbc1941a62f3a41cf8
parent	14e1129068d55bc8de0618832d4f7d33bb1b0f06 (diff)
download	rspamd-3789849b7b2e617d0a287fe77490b6643f3a6b74.tar.gz rspamd-3789849b7b2e617d0a287fe77490b6643f3a6b74.zip