aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2012-10-04 22:14:10 +0400
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2012-10-04 22:14:10 +0400
commit3789849b7b2e617d0a287fe77490b6643f3a6b74 (patch)
tree8415305aee39a3aad6adbccbc1941a62f3a41cf8
parent14e1129068d55bc8de0618832d4f7d33bb1b0f06 (diff)
downloadrspamd-3789849b7b2e617d0a287fe77490b6643f3a6b74.tar.gz
rspamd-3789849b7b2e617d0a287fe77490b6643f3a6b74.zip
* Add meta-classification example.
Many changes to the advanced statistics and meta-classification logic. Add an example of complex meta-classification.
-rw-r--r--conf/lua/rspamd.classifiers.lua114
-rw-r--r--conf/rspamd-basic.xml.in126
-rw-r--r--lib/client/librspamdclient.c12
-rw-r--r--src/classifiers/bayes.c11
-rw-r--r--src/controller.c21
-rw-r--r--src/lua/lua_classifier.c9
-rw-r--r--src/statfile.c20
7 files changed, 276 insertions, 37 deletions
diff --git a/conf/lua/rspamd.classifiers.lua b/conf/lua/rspamd.classifiers.lua
index e158a29a6..de1e23506 100644
--- a/conf/lua/rspamd.classifiers.lua
+++ b/conf/lua/rspamd.classifiers.lua
@@ -1,5 +1,76 @@
-- Detect the language of a message and select the appropriate statfiles for it
+-- Common labels for specific statfiles
+local many_recipients_label = 'many recipients'
+local undisclosed_recipients_label = 'undisclosed recipients'
+local list_label = 'maillist'
+local long_subject_label = 'long subject'
+local different_reply_to_label = 'different reply to'
+local has_in_reply_label = 'reply message'
+
+-- Get specific statfiles set based on message rules.
+-- For each known label the classifier is asked for the statfiles carrying that
+-- label; when the message matches the rule tied to the label, those statfiles
+-- are appended to the result.
+-- @param classifier object providing get_statfile_by_label(label)
+-- @param task object providing get_recipients() and get_raw_header(name)
+-- @return table of collected statfiles when more than one was collected,
+--         nil otherwise
+local function get_specific_statfiles(classifier, task)
+  -- Statfiles collected from every matching rule, in rule order
+  local spec_st = {}
+
+  -- Append every entry of a label's statfile set to the result
+  -- (assumes get_statfile_by_label returns a table or nil -- TODO confirm)
+  local function append_all(statfiles)
+    for _, st in pairs(statfiles) do
+      table.insert(spec_st, st)
+    end
+  end
+
+  -- More than 5 recipients
+  local st_many = classifier:get_statfile_by_label(many_recipients_label)
+  if st_many then
+    -- Keep the recipients list local so it does not leak into globals
+    local rcpt = task:get_recipients()
+    if rcpt and table.maxn(rcpt) > 5 then
+      append_all(st_many)
+    end
+  end
+  -- Undisclosed
+  local st_undisc = classifier:get_statfile_by_label(undisclosed_recipients_label)
+  if st_undisc then
+    local rcpt = task:get_recipients()
+    -- NOTE(review): a nil recipients list does not match this rule; only an
+    -- empty table does -- confirm what get_recipients() returns for no recipients
+    if rcpt and table.maxn(rcpt) == 0 then
+      append_all(st_undisc)
+    end
+  end
+  -- Maillist
+  local st_maillist = classifier:get_statfile_by_label(list_label)
+  if st_maillist then
+    local unsub_header = task:get_raw_header('List-Unsubscribe')
+    if unsub_header and unsub_header[1] then
+      append_all(st_maillist)
+    end
+  end
+  -- Long subject (raw value longer than 150 bytes)
+  local st_longsubj = classifier:get_statfile_by_label(long_subject_label)
+  if st_longsubj then
+    local subj = task:get_raw_header('Subject')
+    if subj and subj[1] and string.len(subj[1]['value']) > 150 then
+      append_all(st_longsubj)
+    end
+  end
+  -- Reply-To != To (case-insensitive comparison of the first raw values)
+  local st_replyto = classifier:get_statfile_by_label(different_reply_to_label)
+  if st_replyto then
+    local to = task:get_raw_header('To')
+    local reply_to = task:get_raw_header('Reply-To')
+    if to and to[1] and reply_to and reply_to[1] then
+      if string.lower(to[1]['value']) ~= string.lower(reply_to[1]['value']) then
+        append_all(st_replyto)
+      end
+    end
+  end
+  -- Has In-Reply-To header
+  local st_reply = classifier:get_statfile_by_label(has_in_reply_label)
+  if st_reply then
+    local inrep_header = task:get_raw_header('In-Reply-To')
+    if inrep_header and inrep_header[1] then
+      append_all(st_reply)
+    end
+  end
+
+  -- A single statfile is not returned: the result is used only when more
+  -- than one statfile has been collected
+  if table.maxn(spec_st) > 1 then
+    return spec_st
+  else
+    return nil
+  end
+end
+
classifiers['bayes'] = function(classifier, task, is_learn, is_spam)
-- Subfunction for detection of message's language
local detect_language = function(task)
@@ -14,35 +85,50 @@ classifiers['bayes'] = function(classifier, task, is_learn, is_spam)
end
-- Main procedure
+ local selected = {}
+ local spec_st = get_specific_statfiles(classifier, task)
+ if spec_st then
+ if is_learn then
+ return spec_st
+ else
+ -- Merge tables
+ table.foreach(spec_st, function(i,v) table.insert(selected,v) end)
+ end
+ end
+ -- Detect statfile by language
language = detect_language(task)
if language then
-- Find statfiles with specified language
- local selected = {}
- for _,st in pairs(classifier:get_statfiles()) do
- local st_l = st:get_param('language')
- if st_l and st_l == language then
- -- Insert statfile with specified language
- table.insert(selected, st)
+ for _,st in ipairs(classifier:get_statfiles()) do
+ -- Skip labeled statfiles
+ if not st:get_label() then
+ local st_l = st:get_param('language')
+ if st_l and st_l == language then
+ -- Insert statfile with specified language
+ table.insert(selected, st)
+ end
end
end
if table.maxn(selected) > 1 then
return selected
end
- else
- -- Language not detected
- local selected = {}
- for _,st in ipairs(classifier:get_statfiles()) do
+ end
+
+ -- Language not detected or specific language statfiles have not been found
+ for _,st in ipairs(classifier:get_statfiles()) do
+ -- Skip labeled statfiles
+ if not st:get_label() then
local st_l = st:get_param('language')
-- Insert only statfiles without language
if not st_l then
table.insert(selected, st)
end
end
- if table.maxn(selected) > 1 then
- return selected
- end
end
-
+ if table.maxn(selected) > 1 then
+ return selected
+ end
+
return nil
end
diff --git a/conf/rspamd-basic.xml.in b/conf/rspamd-basic.xml.in
index 06bd80c03..fbeee898a 100644
--- a/conf/rspamd-basic.xml.in
+++ b/conf/rspamd-basic.xml.in
@@ -603,6 +603,132 @@
</statfile>
</classifier>
+<!-- Advanced meta-classification statistic -->
+<!--
+<classifier type="bayes">
+ <tokenizer>osb-text</tokenizer>
+ <metric>default</metric>
+ <min_tokens>6</min_tokens>
+ <max_tokens>5000</max_tokens>
+ <statfile>
+ <symbol>BAYES_HAM</symbol>
+ <size>50M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes.ham</path>
+ <spam>no</spam>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_SPAM</symbol>
+ <size>50M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes.spam</path>
+ <spam>yes</spam>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_HAM_RU</symbol>
+ <size>50M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_ru.ham</path>
+ <language>ru</language>
+ <spam>no</spam>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_SPAM_RU</symbol>
+ <size>50M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_ru.spam</path>
+ <language>ru</language>
+ <spam>yes</spam>
+ </statfile>
+
+ <statfile>
+ <symbol>BAYES_SPAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_manyrcpt.spam</path>
+ <spam>yes</spam>
+ <label>many recipients</label>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_HAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_manyrcpt.ham</path>
+ <spam>no</spam>
+ <label>many recipients</label>
+ </statfile>
+
+ <statfile>
+ <symbol>BAYES_SPAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_undisclosedrcpt.spam</path>
+ <spam>yes</spam>
+ <label>undisclosed recipients</label>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_HAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_undisclosedrcpt.ham</path>
+ <spam>no</spam>
+ <label>undisclosed recipients</label>
+ </statfile>
+
+ <statfile>
+ <symbol>BAYES_SPAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_maillist.spam</path>
+ <spam>yes</spam>
+ <label>maillist</label>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_HAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_maillist.ham</path>
+ <spam>no</spam>
+ <label>maillist</label>
+ </statfile>
+
+ <statfile>
+ <symbol>BAYES_SPAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_longsubject.spam</path>
+ <spam>yes</spam>
+ <label>long subject</label>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_HAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_longsubject.ham</path>
+ <spam>no</spam>
+ <label>long subject</label>
+ </statfile>
+
+ <statfile>
+ <symbol>BAYES_SPAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_replyto.spam</path>
+ <spam>yes</spam>
+ <label>different reply to</label>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_HAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_replyto.ham</path>
+ <spam>no</spam>
+ <label>different reply to</label>
+ </statfile>
+
+ <statfile>
+ <symbol>BAYES_SPAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_replymessage.spam</path>
+ <spam>yes</spam>
+ <label>reply message</label>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_HAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_replymessage.ham</path>
+ <spam>no</spam>
+ <label>reply message</label>
+ </statfile>
+</classifier>
+-->
+
<!-- End of classifiers section -->
<!-- Modules section -->
diff --git a/lib/client/librspamdclient.c b/lib/client/librspamdclient.c
index 7a073f410..0b6cd05d3 100644
--- a/lib/client/librspamdclient.c
+++ b/lib/client/librspamdclient.c
@@ -1175,9 +1175,19 @@ rspamd_add_server (struct rspamd_client *client, const gchar *host, guint16 port
{
struct rspamd_server *new;
struct hostent *hent;
- gint nlen;
+ gint nlen, i;
g_assert (client != NULL);
+
+ /* Avoid duplicates */
+ for (i = 0; i < (gint)client->servers_num; i ++) {
+ new = &client->servers[i];
+ if (new->client_port == port && new->controller_port == controller_port && strcmp (host, new->host) == 0) {
+ /* Duplicate */
+ return TRUE;
+ }
+ }
+
if (client->servers_num >= MAX_RSPAMD_SERVERS) {
if (*err == NULL) {
*err = g_error_new (G_RSPAMD_ERROR, 1, "Maximum number of servers reached: %d", MAX_RSPAMD_SERVERS);
diff --git a/src/classifiers/bayes.c b/src/classifiers/bayes.c
index cad963c4b..a80bbe0ba 100644
--- a/src/classifiers/bayes.c
+++ b/src/classifiers/bayes.c
@@ -391,6 +391,7 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
struct statfile *st;
stat_file_t *file;
GList *cur;
+ gboolean skip_labels;
g_assert (pool != NULL);
g_assert (ctx != NULL);
@@ -411,11 +412,14 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
}
}
- cur = call_classifier_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L);
+ cur = call_classifier_pre_callbacks (ctx->cfg, task, TRUE, is_spam, L);
if (cur) {
+ skip_labels = FALSE;
memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_list_free, cur);
}
else {
+ /* Do not try to learn specific statfiles if pre callback returned nil */
+ skip_labels = TRUE;
cur = ctx->cfg->statfiles;
}
@@ -435,7 +439,7 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
while (cur) {
/* Select statfiles to learn */
st = cur->data;
- if (st->is_spam != is_spam) {
+ if (st->is_spam != is_spam || (skip_labels && st->label)) {
cur = g_list_next (cur);
continue;
}
@@ -460,8 +464,6 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
msg_err ("cannot open statfile %s after creation", st->path);
return FALSE;
}
- cur = g_list_next (cur);
- continue;
}
}
data.file = file;
@@ -470,6 +472,7 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
statfile_inc_revision (file);
statfile_pool_unlock_file (pool, data.file);
maybe_write_binlog (ctx->cfg, st, file, input);
+ msg_info ("increase revision for %s", st->path);
cur = g_list_next (cur);
}
diff --git a/src/controller.c b/src/controller.c
index c987bc15f..47d444317 100644
--- a/src/controller.c
+++ b/src/controller.c
@@ -488,11 +488,20 @@ process_stat_command (struct controller_session *session)
total = statfile_get_total_blocks (statfile);
statfile_get_revision (statfile, &rev, &ti);
if (total != (guint64)-1 && used != (guint64)-1) {
- r += rspamd_snprintf (out_buf + r, sizeof (out_buf) - r,
- "Statfile: %s (version %uL); length: %Hz; free blocks: %uL; total blocks: %uL; free: %.2f%%" CRLF,
- st->symbol, rev, st->size,
- (total - used), total,
- (double)((double)(total - used) / (double)total) * 100.);
+ if (st->label) {
+ r += rspamd_snprintf (out_buf + r, sizeof (out_buf) - r,
+ "Statfile: %s <%s> (version %uL); length: %Hz; free blocks: %uL; total blocks: %uL; free: %.2f%%" CRLF,
+ st->symbol, st->label, rev, st->size,
+ (total - used), total,
+ (double)((double)(total - used) / (double)total) * 100.);
+ }
+ else {
+ r += rspamd_snprintf (out_buf + r, sizeof (out_buf) - r,
+ "Statfile: %s (version %uL); length: %Hz; free blocks: %uL; total blocks: %uL; free: %.2f%%" CRLF,
+ st->symbol, rev, st->size,
+ (total - used), total,
+ (double)((double)(total - used) / (double)total) * 100.);
+ }
}
}
cur_st = g_list_next (cur_st);
@@ -1173,8 +1182,6 @@ fin_learn_task (void *arg)
if (task->state != WRITING_REPLY) {
task->state = WRITE_REPLY;
- /* Process all statfiles */
- process_statfiles (task);
}
/* Check if we have all events finished */
diff --git a/src/lua/lua_classifier.c b/src/lua/lua_classifier.c
index be18cda0d..202d29af3 100644
--- a/src/lua/lua_classifier.c
+++ b/src/lua/lua_classifier.c
@@ -247,19 +247,18 @@ lua_classifier_get_statfiles (lua_State *L)
struct classifier_config *ccf = lua_check_classifier (L);
GList *cur;
struct statfile *st, **pst;
+ gint i;
if (ccf) {
lua_newtable (L);
cur = g_list_first (ccf->statfiles);
+ i = 1;
while (cur) {
st = cur->data;
- /* t['statfile_name'] = statfile */
- lua_pushstring (L, st->symbol);
pst = lua_newuserdata (L, sizeof (struct statfile *));
lua_setclass (L, "rspamd{statfile}", -1);
*pst = st;
-
- lua_settable (L, -3);
+ lua_rawseti (L, -2, i++);
cur = g_list_next (cur);
}
@@ -388,7 +387,7 @@ lua_statfile_get_param (lua_State *L)
if (st != NULL && param != NULL) {
value = g_hash_table_lookup (st->opts, param);
- if (param != NULL) {
+ if (value != NULL) {
lua_pushstring (L, value);
return 1;
}
diff --git a/src/statfile.c b/src/statfile.c
index 3c4674fc9..15c41550a 100644
--- a/src/statfile.c
+++ b/src/statfile.c
@@ -415,7 +415,7 @@ statfile_pool_close (statfile_pool_t * pool, stat_file_t * file, gboolean keep_s
if (file->map) {
msg_info ("syncing statfile %s", file->filename);
- msync (file->map, file->len, MS_INVALIDATE | MS_SYNC);
+ msync (file->map, file->len, MS_ASYNC);
munmap (file->map, file->len);
}
if (file->fd != -1) {
@@ -606,7 +606,7 @@ statfile_pool_set_block_common (statfile_pool_t * pool, stat_file_t * file, guin
for (i = 0; i < CHAIN_LENGTH; i++) {
if (i + blocknum >= file->cur_section.length) {
/* Need to expire some block in chain */
- msg_debug ("chain %u is full, starting expire", blocknum);
+ msg_info ("chain %ud is full in statfile %s, starting expire", blocknum, file->filename);
break;
}
/* First try to find block in chain */
@@ -617,7 +617,7 @@ statfile_pool_set_block_common (statfile_pool_t * pool, stat_file_t * file, guin
/* Check whether we have a free block in chain */
if (block->hash1 == 0 && block->hash2 == 0) {
/* Write new block here */
- msg_debug ("found free block %u in chain %u, set h1=%u, h2=%u", i, blocknum, h1, h2);
+ msg_debug ("found free block %ud in chain %ud, set h1=%ud, h2=%ud", i, blocknum, h1, h2);
block->hash1 = h1;
block->hash2 = h2;
block->value = value;
@@ -880,12 +880,20 @@ statfile_pool_invalidate_callback (gint fd, short what, void *ud)
void
statfile_pool_plan_invalidate (statfile_pool_t *pool, time_t seconds, time_t jitter)
{
+ gboolean pending;
- if (pool->invalidate_event == NULL || ! evtimer_pending (pool->invalidate_event, NULL)) {
- if (pool->invalidate_event == NULL) {
- pool->invalidate_event = memory_pool_alloc (pool->pool, sizeof (struct event));
+ if (pool->invalidate_event != NULL) {
+ pending = evtimer_pending (pool->invalidate_event, NULL);
+ if (pending) {
+ /* Replan event */
+ pool->invalidate_tv.tv_sec = seconds + g_random_int_range (0, jitter);
+ pool->invalidate_tv.tv_usec = 0;
+ evtimer_add (pool->invalidate_event, &pool->invalidate_tv);
}
+ }
+ else {
+ pool->invalidate_event = memory_pool_alloc (pool->pool, sizeof (struct event));
pool->invalidate_tv.tv_sec = seconds + g_random_int_range (0, jitter);
pool->invalidate_tv.tv_usec = 0;
evtimer_set (pool->invalidate_event, statfile_pool_invalidate_callback, pool);