diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2012-10-04 22:14:10 +0400 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2012-10-04 22:14:10 +0400 |
commit | 3789849b7b2e617d0a287fe77490b6643f3a6b74 (patch) | |
tree | 8415305aee39a3aad6adbccbc1941a62f3a41cf8 | |
parent | 14e1129068d55bc8de0618832d4f7d33bb1b0f06 (diff) | |
download | rspamd-3789849b7b2e617d0a287fe77490b6643f3a6b74.tar.gz rspamd-3789849b7b2e617d0a287fe77490b6643f3a6b74.zip |
* Add meta-classification example.
Many changes to advanced statistics and meta-classification logic.
Add example of complex meta-classification.
-rw-r--r-- | conf/lua/rspamd.classifiers.lua | 114 | ||||
-rw-r--r-- | conf/rspamd-basic.xml.in | 126 | ||||
-rw-r--r-- | lib/client/librspamdclient.c | 12 | ||||
-rw-r--r-- | src/classifiers/bayes.c | 11 | ||||
-rw-r--r-- | src/controller.c | 21 | ||||
-rw-r--r-- | src/lua/lua_classifier.c | 9 | ||||
-rw-r--r-- | src/statfile.c | 20 |
7 files changed, 276 insertions, 37 deletions
diff --git a/conf/lua/rspamd.classifiers.lua b/conf/lua/rspamd.classifiers.lua index e158a29a6..de1e23506 100644 --- a/conf/lua/rspamd.classifiers.lua +++ b/conf/lua/rspamd.classifiers.lua @@ -1,5 +1,76 @@ -- Detect language of message and selects appropriate statfiles for it +-- Common labels for specific statfiles +local many_recipients_label = 'many recipients' +local undisclosed_recipients_label = 'undisclosed recipients' +local list_label = 'maillist' +local long_subject_label = 'long subject' +local different_reply_to_label = 'different reply to' +local has_in_reply_label = 'reply message' + +-- Get specific statfiles set based on message rules +local function get_specific_statfiles(classifier, task) + local spec_st = {} + -- More 5 recipients + local st_many = classifier:get_statfile_by_label(many_recipients_label) + if st_many then + rcpt = task:get_recipients() + if rcpt and table.maxn(rcpt) > 5 then + print(table.maxn(rcpt)) + table.foreach(st_many, function(i,v) table.insert(spec_st,v) end) + end + end + -- Undisclosed + local st_undisc = classifier:get_statfile_by_label(undisclosed_recipients_label) + if st_undisc then + rcpt = task:get_recipients() + if rcpt and table.maxn(rcpt) == 0 then + table.foreach(st_undisc, function(i,v) table.insert(spec_st,v) end) + end + end + -- Maillist + local st_maillist = classifier:get_statfile_by_label(list_label) + if st_maillist then + local unsub_header = task:get_raw_header('List-Unsubscribe') + if unsub_header and unsub_header[1] then + table.foreach(st_maillist, function(i,v) table.insert(spec_st,v) end) + end + end + -- Long subject + local st_longsubj = classifier:get_statfile_by_label(long_subject_label) + if st_longsubj then + local subj = task:get_raw_header('Subject') + if subj and subj[1] and string.len(subj[1]['value']) > 150 then + table.foreach(st_longsubj, function(i,v) table.insert(spec_st,v) end) + end + end + -- Reply-To != To + local st_replyto = 
classifier:get_statfile_by_label(different_reply_to_label) + if st_replyto then + local to = task:get_raw_header('To') + local reply_to = task:get_raw_header('Reply-To') + if to and to[1] and reply_to and reply_to[1] then + if string.lower(to[1]['value']) ~= string.lower(reply_to[1]['value']) then + table.foreach(st_replyto, function(i,v) table.insert(spec_st,v) end) + end + end + end + -- Has In-Reply-To header + local st_reply = classifier:get_statfile_by_label(has_in_reply_label) + if st_reply then + local inrep_header = task:get_raw_header('In-Reply-To') + if inrep_header and inrep_header[1] then + table.foreach(st_reply, function(i,v) table.insert(spec_st,v) end) + end + end + + if table.maxn(spec_st) > 1 then + return spec_st + else + return nil + end +end + classifiers['bayes'] = function(classifier, task, is_learn, is_spam) -- Subfunction for detection of message's language local detect_language = function(task) @@ -14,35 +85,50 @@ classifiers['bayes'] = function(classifier, task, is_learn, is_spam) end -- Main procedure + local selected = {} + local spec_st = get_specific_statfiles(classifier, task) + if spec_st then + if is_learn then + return spec_st + else + -- Merge tables + table.foreach(spec_st, function(i,v) table.insert(selected,v) end) + end + end + -- Detect statfile by language language = detect_language(task) if language then -- Find statfiles with specified language - local selected = {} - for _,st in pairs(classifier:get_statfiles()) do - local st_l = st:get_param('language') - if st_l and st_l == language then - -- Insert statfile with specified language - table.insert(selected, st) + for _,st in ipairs(classifier:get_statfiles()) do + -- Skip labeled statfiles + if not st:get_label() then + local st_l = st:get_param('language') + if st_l and st_l == language then + -- Insert statfile with specified language + table.insert(selected, st) + end end end if table.maxn(selected) > 1 then return selected end - else - -- Language not detected - 
local selected = {} - for _,st in ipairs(classifier:get_statfiles()) do + end + + -- Language not detected or specific language statfiles have not been found + for _,st in ipairs(classifier:get_statfiles()) do + -- Skip labeled statfiles + if not st:get_label() then local st_l = st:get_param('language') -- Insert only statfiles without language if not st_l then table.insert(selected, st) end end - if table.maxn(selected) > 1 then - return selected - end end - + if table.maxn(selected) > 1 then + return selected + end + return nil end diff --git a/conf/rspamd-basic.xml.in b/conf/rspamd-basic.xml.in index 06bd80c03..fbeee898a 100644 --- a/conf/rspamd-basic.xml.in +++ b/conf/rspamd-basic.xml.in @@ -603,6 +603,132 @@ </statfile> </classifier> +<!-- Advanced meta-classification statistic --> +<!-- +<classifier type="bayes"> + <tokenizer>osb-text</tokenizer> + <metric>default</metric> + <min_tokens>6</min_tokens> + <max_tokens>5000</max_tokens> + <statfile> + <symbol>BAYES_HAM</symbol> + <size>50M</size> + <path>@LOCALSTATES_PREFIX@/bayes.ham</path> + <spam>no</spam> + </statfile> + <statfile> + <symbol>BAYES_SPAM</symbol> + <size>50M</size> + <path>@LOCALSTATES_PREFIX@/bayes.spam</path> + <spam>yes</spam> + </statfile> + <statfile> + <symbol>BAYES_HAM_RU</symbol> + <size>50M</size> + <path>@LOCALSTATES_PREFIX@/bayes_ru.ham</path> + <language>ru</language> + <spam>no</spam> + </statfile> + <statfile> + <symbol>BAYES_SPAM_RU</symbol> + <size>50M</size> + <path>@LOCALSTATES_PREFIX@/bayes_ru.spam</path> + <language>ru</language> + <spam>yes</spam> + </statfile> + + <statfile> + <symbol>BAYES_SPAM</symbol> + <size>10M</size> + <path>@LOCALSTATES_PREFIX@/bayes_manyrcpt.spam</path> + <spam>yes</spam> + <label>many recipients</label> + </statfile> + <statfile> + <symbol>BAYES_HAM</symbol> + <size>10M</size> + <path>@LOCALSTATES_PREFIX@/bayes_manyrcpt.ham</path> + <spam>no</spam> + <label>many recipients</label> + </statfile> + + <statfile> + <symbol>BAYES_SPAM</symbol> + 
<size>10M</size> + <path>@LOCALSTATES_PREFIX@/bayes_undisclosedrcpt.spam</path> + <spam>yes</spam> + <label>undisclosed recipients</label> + </statfile> + <statfile> + <symbol>BAYES_HAM</symbol> + <size>10M</size> + <path>@LOCALSTATES_PREFIX@/bayes_undisclosedrcpt.ham</path> + <spam>no</spam> + <label>undisclosed recipients</label> + </statfile> + + <statfile> + <symbol>BAYES_SPAM</symbol> + <size>10M</size> + <path>@LOCALSTATES_PREFIX@/bayes_maillist.spam</path> + <spam>yes</spam> + <label>maillist</label> + </statfile> + <statfile> + <symbol>BAYES_HAM</symbol> + <size>10M</size> + <path>@LOCALSTATES_PREFIX@/bayes_maillist.ham</path> + <spam>no</spam> + <label>maillist</label> + </statfile> + + <statfile> + <symbol>BAYES_SPAM</symbol> + <size>10M</size> + <path>@LOCALSTATES_PREFIX@/bayes_longsubject.spam</path> + <spam>yes</spam> + <label>long subject</label> + </statfile> + <statfile> + <symbol>BAYES_HAM</symbol> + <size>10M</size> + <path>@LOCALSTATES_PREFIX@/bayes_longsubject.ham</path> + <spam>no</spam> + <label>long subject</label> + </statfile> + + <statfile> + <symbol>BAYES_SPAM</symbol> + <size>10M</size> + <path>@LOCALSTATES_PREFIX@/bayes_replyto.spam</path> + <spam>yes</spam> + <label>different reply to</label> + </statfile> + <statfile> + <symbol>BAYES_HAM</symbol> + <size>10M</size> + <path>@LOCALSTATES_PREFIX@/bayes_replyto.ham</path> + <spam>no</spam> + <label>different reply to</label> + </statfile> + + <statfile> + <symbol>BAYES_SPAM</symbol> + <size>10M</size> + <path>@LOCALSTATES_PREFIX@/bayes_replymessage.spam</path> + <spam>yes</spam> + <label>reply message</label> + </statfile> + <statfile> + <symbol>BAYES_HAM</symbol> + <size>10M</size> + <path>@LOCALSTATES_PREFIX@/bayes_replymessage.ham</path> + <spam>no</spam> + <label>reply message</label> + </statfile> +</classifier> +--> + <!-- End of classifiers section --> <!-- Modules section --> diff --git a/lib/client/librspamdclient.c b/lib/client/librspamdclient.c index 7a073f410..0b6cd05d3 100644 
--- a/lib/client/librspamdclient.c +++ b/lib/client/librspamdclient.c @@ -1175,9 +1175,19 @@ rspamd_add_server (struct rspamd_client *client, const gchar *host, guint16 port { struct rspamd_server *new; struct hostent *hent; - gint nlen; + gint nlen, i; g_assert (client != NULL); + + /* Avoid duplicates */ + for (i = 0; i < (gint)client->servers_num; i ++) { + new = &client->servers[i]; + if (new->client_port == port && new->controller_port == controller_port && strcmp (host, new->host) == 0) { + /* Duplicate */ + return TRUE; + } + } + if (client->servers_num >= MAX_RSPAMD_SERVERS) { if (*err == NULL) { *err = g_error_new (G_RSPAMD_ERROR, 1, "Maximum number of servers reached: %d", MAX_RSPAMD_SERVERS); diff --git a/src/classifiers/bayes.c b/src/classifiers/bayes.c index cad963c4b..a80bbe0ba 100644 --- a/src/classifiers/bayes.c +++ b/src/classifiers/bayes.c @@ -391,6 +391,7 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool, struct statfile *st; stat_file_t *file; GList *cur; + gboolean skip_labels; g_assert (pool != NULL); g_assert (ctx != NULL); @@ -411,11 +412,14 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool, } } - cur = call_classifier_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L); + cur = call_classifier_pre_callbacks (ctx->cfg, task, TRUE, is_spam, L); if (cur) { + skip_labels = FALSE; memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_list_free, cur); } else { + /* Do not try to learn specific statfiles if pre callback returned nil */ + skip_labels = TRUE; cur = ctx->cfg->statfiles; } @@ -435,7 +439,7 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool, while (cur) { /* Select statfiles to learn */ st = cur->data; - if (st->is_spam != is_spam) { + if (st->is_spam != is_spam || (skip_labels && st->label)) { cur = g_list_next (cur); continue; } @@ -460,8 +464,6 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool, msg_err ("cannot open statfile %s after 
creation", st->path); return FALSE; } - cur = g_list_next (cur); - continue; } } data.file = file; @@ -470,6 +472,7 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool, statfile_inc_revision (file); statfile_pool_unlock_file (pool, data.file); maybe_write_binlog (ctx->cfg, st, file, input); + msg_info ("increase revision for %s", st->path); cur = g_list_next (cur); } diff --git a/src/controller.c b/src/controller.c index c987bc15f..47d444317 100644 --- a/src/controller.c +++ b/src/controller.c @@ -488,11 +488,20 @@ process_stat_command (struct controller_session *session) total = statfile_get_total_blocks (statfile); statfile_get_revision (statfile, &rev, &ti); if (total != (guint64)-1 && used != (guint64)-1) { - r += rspamd_snprintf (out_buf + r, sizeof (out_buf) - r, - "Statfile: %s (version %uL); length: %Hz; free blocks: %uL; total blocks: %uL; free: %.2f%%" CRLF, - st->symbol, rev, st->size, - (total - used), total, - (double)((double)(total - used) / (double)total) * 100.); + if (st->label) { + r += rspamd_snprintf (out_buf + r, sizeof (out_buf) - r, + "Statfile: %s <%s> (version %uL); length: %Hz; free blocks: %uL; total blocks: %uL; free: %.2f%%" CRLF, + st->symbol, st->label, rev, st->size, + (total - used), total, + (double)((double)(total - used) / (double)total) * 100.); + } + else { + r += rspamd_snprintf (out_buf + r, sizeof (out_buf) - r, + "Statfile: %s (version %uL); length: %Hz; free blocks: %uL; total blocks: %uL; free: %.2f%%" CRLF, + st->symbol, rev, st->size, + (total - used), total, + (double)((double)(total - used) / (double)total) * 100.); + } } } cur_st = g_list_next (cur_st); @@ -1173,8 +1182,6 @@ fin_learn_task (void *arg) if (task->state != WRITING_REPLY) { task->state = WRITE_REPLY; - /* Process all statfiles */ - process_statfiles (task); } /* Check if we have all events finished */ diff --git a/src/lua/lua_classifier.c b/src/lua/lua_classifier.c index be18cda0d..202d29af3 100644 --- a/src/lua/lua_classifier.c +++ 
b/src/lua/lua_classifier.c @@ -247,19 +247,18 @@ lua_classifier_get_statfiles (lua_State *L) struct classifier_config *ccf = lua_check_classifier (L); GList *cur; struct statfile *st, **pst; + gint i; if (ccf) { lua_newtable (L); cur = g_list_first (ccf->statfiles); + i = 1; while (cur) { st = cur->data; - /* t['statfile_name'] = statfile */ - lua_pushstring (L, st->symbol); pst = lua_newuserdata (L, sizeof (struct statfile *)); lua_setclass (L, "rspamd{statfile}", -1); *pst = st; - - lua_settable (L, -3); + lua_rawseti (L, -2, i++); cur = g_list_next (cur); } @@ -388,7 +387,7 @@ lua_statfile_get_param (lua_State *L) if (st != NULL && param != NULL) { value = g_hash_table_lookup (st->opts, param); - if (param != NULL) { + if (value != NULL) { lua_pushstring (L, value); return 1; } diff --git a/src/statfile.c b/src/statfile.c index 3c4674fc9..15c41550a 100644 --- a/src/statfile.c +++ b/src/statfile.c @@ -415,7 +415,7 @@ statfile_pool_close (statfile_pool_t * pool, stat_file_t * file, gboolean keep_s if (file->map) { msg_info ("syncing statfile %s", file->filename); - msync (file->map, file->len, MS_INVALIDATE | MS_SYNC); + msync (file->map, file->len, MS_ASYNC); munmap (file->map, file->len); } if (file->fd != -1) { @@ -606,7 +606,7 @@ statfile_pool_set_block_common (statfile_pool_t * pool, stat_file_t * file, guin for (i = 0; i < CHAIN_LENGTH; i++) { if (i + blocknum >= file->cur_section.length) { /* Need to expire some block in chain */ - msg_debug ("chain %u is full, starting expire", blocknum); + msg_info ("chain %ud is full in statfile %s, starting expire", blocknum, file->filename); break; } /* First try to find block in chain */ @@ -617,7 +617,7 @@ statfile_pool_set_block_common (statfile_pool_t * pool, stat_file_t * file, guin /* Check whether we have a free block in chain */ if (block->hash1 == 0 && block->hash2 == 0) { /* Write new block here */ - msg_debug ("found free block %u in chain %u, set h1=%u, h2=%u", i, blocknum, h1, h2); + msg_debug ("found free 
block %ud in chain %ud, set h1=%ud, h2=%ud", i, blocknum, h1, h2); block->hash1 = h1; block->hash2 = h2; block->value = value; @@ -880,12 +880,20 @@ statfile_pool_invalidate_callback (gint fd, short what, void *ud) void statfile_pool_plan_invalidate (statfile_pool_t *pool, time_t seconds, time_t jitter) { + gboolean pending; - if (pool->invalidate_event == NULL || ! evtimer_pending (pool->invalidate_event, NULL)) { - if (pool->invalidate_event == NULL) { - pool->invalidate_event = memory_pool_alloc (pool->pool, sizeof (struct event)); + if (pool->invalidate_event != NULL) { + pending = evtimer_pending (pool->invalidate_event, NULL); + if (pending) { + /* Replan event */ + pool->invalidate_tv.tv_sec = seconds + g_random_int_range (0, jitter); + pool->invalidate_tv.tv_usec = 0; + evtimer_add (pool->invalidate_event, &pool->invalidate_tv); } + } + else { + pool->invalidate_event = memory_pool_alloc (pool->pool, sizeof (struct event)); pool->invalidate_tv.tv_sec = seconds + g_random_int_range (0, jitter); pool->invalidate_tv.tv_usec = 0; evtimer_set (pool->invalidate_event, statfile_pool_invalidate_callback, pool); |