]> source.dussan.org Git - rspamd.git/commitdiff
* Add meta-classification example.
authorVsevolod Stakhov <vsevolod@rambler-co.ru>
Thu, 4 Oct 2012 18:14:10 +0000 (22:14 +0400)
committerVsevolod Stakhov <vsevolod@rambler-co.ru>
Thu, 4 Oct 2012 18:14:10 +0000 (22:14 +0400)
Many changes to the advanced statistics and meta-classification logic.
Add example of complex meta-classification.

conf/lua/rspamd.classifiers.lua
conf/rspamd-basic.xml.in
lib/client/librspamdclient.c
src/classifiers/bayes.c
src/controller.c
src/lua/lua_classifier.c
src/statfile.c

index e158a29a6cffe7f384c043425bf6d4790f0a8858..de1e23506740055c446af471cd0578b3d4fb698f 100644 (file)
@@ -1,5 +1,76 @@
 -- Detect the language of a message and select the appropriate statfiles for it
 
+-- Labels used to look up special-purpose statfiles via get_statfile_by_label()
+local many_recipients_label = 'many recipients'
+local undisclosed_recipients_label = 'undisclosed recipients'
+local list_label = 'maillist'
+local long_subject_label = 'long subject'
+local different_reply_to_label = 'different reply to'
+local has_in_reply_label = 'reply message'
+
+-- Collect the label-specific statfiles whose rule matches the message.
+-- classifier: object queried with get_statfile_by_label()
+-- task: message task queried with get_recipients() and get_raw_header()
+-- Returns the array of selected statfiles, or nil if fewer than two matched
+local function get_specific_statfiles(classifier, task)
+       local spec_st = {}
+
+       -- Append every statfile from src to the result set
+       local function add_statfiles(src)
+               for _,v in pairs(src) do
+                       table.insert(spec_st, v)
+               end
+       end
+
+       -- Recipient based rules: the recipients list is fetched once and kept
+       -- local so that it does not leak into the global environment
+       local st_many = classifier:get_statfile_by_label(many_recipients_label)
+       local st_undisc = classifier:get_statfile_by_label(undisclosed_recipients_label)
+       if st_many or st_undisc then
+               local rcpt = task:get_recipients()
+               if rcpt then
+                       local nrcpt = table.maxn(rcpt)
+                       -- More than 5 recipients
+                       if st_many and nrcpt > 5 then
+                               add_statfiles(st_many)
+                       end
+                       -- Undisclosed (empty recipients list)
+                       if st_undisc and nrcpt == 0 then
+                               add_statfiles(st_undisc)
+                       end
+               end
+       end
+       -- Maillist
+       local st_maillist = classifier:get_statfile_by_label(list_label)
+       if st_maillist then
+               local unsub_header = task:get_raw_header('List-Unsubscribe')
+               if unsub_header and unsub_header[1] then
+                       add_statfiles(st_maillist)
+               end
+       end
+       -- Long subject (more than 150 bytes)
+       local st_longsubj = classifier:get_statfile_by_label(long_subject_label)
+       if st_longsubj then
+               local subj = task:get_raw_header('Subject')
+               if subj and subj[1] and string.len(subj[1]['value']) > 150 then
+                       add_statfiles(st_longsubj)
+               end
+       end
+       -- Reply-To != To (case-insensitive comparison of the first header values)
+       local st_replyto = classifier:get_statfile_by_label(different_reply_to_label)
+       if st_replyto then
+               local to = task:get_raw_header('To')
+               local reply_to = task:get_raw_header('Reply-To')
+               if to and to[1] and reply_to and reply_to[1] then
+                       if string.lower(to[1]['value']) ~= string.lower(reply_to[1]['value']) then
+                               add_statfiles(st_replyto)
+                       end
+               end
+       end
+       -- Has In-Reply-To header
+       local st_reply = classifier:get_statfile_by_label(has_in_reply_label)
+       if st_reply then
+               local inrep_header = task:get_raw_header('In-Reply-To')
+               if inrep_header and inrep_header[1] then
+                       add_statfiles(st_reply)
+               end
+       end
+
+       -- A single statfile is not returned: at least two must be selected
+       if table.maxn(spec_st) > 1 then
+               return spec_st
+       else
+               return nil
+       end
+end
+
 classifiers['bayes'] = function(classifier, task, is_learn, is_spam)
        -- Subfunction for detection of message's language
        local detect_language = function(task)
@@ -14,35 +85,50 @@ classifiers['bayes'] = function(classifier, task, is_learn, is_spam)
        end
 
        -- Main procedure
+       local selected = {}
+       local spec_st = get_specific_statfiles(classifier, task)
+       if spec_st then
+               if is_learn then
+                       return spec_st
+               else
+                       -- Merge tables
+                       table.foreach(spec_st, function(i,v) table.insert(selected,v) end)
+               end
+       end
+       -- Detect statfile by language
        language = detect_language(task)
        if language then
                -- Find statfiles with specified language
-               local selected = {}
-               for _,st in pairs(classifier:get_statfiles()) do
-                       local st_l = st:get_param('language')
-                       if st_l and st_l == language then
-                           -- Insert statfile with specified language    
-                           table.insert(selected, st)
+               for _,st in ipairs(classifier:get_statfiles()) do
+                       -- Skip labeled statfiles
+                       if not st:get_label() then
+                               local st_l = st:get_param('language')
+                               if st_l and st_l == language then
+                                       -- Insert statfile with specified language    
+                                       table.insert(selected, st)
+                               end
                        end
                end
                if table.maxn(selected) > 1 then
                        return selected
                end
-       else
-               -- Language not detected
-               local selected = {}
-               for _,st in ipairs(classifier:get_statfiles()) do
+       end
+
+       -- Language not detected or specific language statfiles have not been found
+       for _,st in ipairs(classifier:get_statfiles()) do
+               -- Skip labeled statfiles
+               if not st:get_label() then
                        local st_l = st:get_param('language')
                        -- Insert only statfiles without language
                        if not st_l then
                                table.insert(selected, st)
                        end
                end
-               if table.maxn(selected) > 1 then
-                       return selected
-               end
        end
-
+       if table.maxn(selected) > 1 then
+               return selected
+       end
+       
        return nil
 end
 
index 06bd80c03d65ba319d78c9de9db49d8e900032da..fbeee898ae1efa0dfc2988f29500e9c441511a95 100644 (file)
  </statfile>
 </classifier>
 
+<!-- Advanced meta-classification statistics (example, commented out by default) -->
+<!--
+<classifier type="bayes">
+ <tokenizer>osb-text</tokenizer>
+ <metric>default</metric>
+ <min_tokens>6</min_tokens>
+ <max_tokens>5000</max_tokens>
+ <statfile>
+  <symbol>BAYES_HAM</symbol>
+  <size>50M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes.ham</path>
+  <spam>no</spam>
+ </statfile>
+ <statfile>
+  <symbol>BAYES_SPAM</symbol>
+  <size>50M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes.spam</path>
+  <spam>yes</spam>
+ </statfile>
+ <statfile>
+  <symbol>BAYES_HAM_RU</symbol>
+  <size>50M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_ru.ham</path>
+  <language>ru</language>
+  <spam>no</spam>
+ </statfile>
+ <statfile>
+  <symbol>BAYES_SPAM_RU</symbol>
+  <size>50M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_ru.spam</path>
+  <language>ru</language>
+  <spam>yes</spam>
+ </statfile>
+
+ <statfile>
+  <symbol>BAYES_SPAM</symbol>
+  <size>10M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_manyrcpt.spam</path>
+  <spam>yes</spam>
+  <label>many recipients</label>
+ </statfile>
+ <statfile>
+  <symbol>BAYES_HAM</symbol>
+  <size>10M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_manyrcpt.ham</path>
+  <spam>no</spam>
+  <label>many recipients</label>
+ </statfile>
+
+ <statfile>
+  <symbol>BAYES_SPAM</symbol>
+  <size>10M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_undisclosedrcpt.spam</path>
+  <spam>yes</spam>
+  <label>undisclosed recipients</label>
+ </statfile>
+ <statfile>
+  <symbol>BAYES_HAM</symbol>
+  <size>10M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_undisclosedrcpt.ham</path>
+  <spam>no</spam>
+  <label>undisclosed recipients</label>
+ </statfile>
+
+ <statfile>
+  <symbol>BAYES_SPAM</symbol>
+  <size>10M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_maillist.spam</path>
+  <spam>yes</spam>
+  <label>maillist</label>
+ </statfile>
+ <statfile>
+  <symbol>BAYES_HAM</symbol>
+  <size>10M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_maillist.ham</path>
+  <spam>no</spam>
+  <label>maillist</label>
+ </statfile>
+
+ <statfile>
+  <symbol>BAYES_SPAM</symbol>
+  <size>10M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_longsubject.spam</path>
+  <spam>yes</spam>
+  <label>long subject</label>
+ </statfile>
+ <statfile>
+  <symbol>BAYES_HAM</symbol>
+  <size>10M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_longsubject.ham</path>
+  <spam>no</spam>
+  <label>long subject</label>
+ </statfile>
+
+ <statfile>
+  <symbol>BAYES_SPAM</symbol>
+  <size>10M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_replyto.spam</path>
+  <spam>yes</spam>
+  <label>different reply to</label>
+ </statfile>
+ <statfile>
+  <symbol>BAYES_HAM</symbol>
+  <size>10M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_replyto.ham</path>
+  <spam>no</spam>
+  <label>different reply to</label>
+ </statfile>
+
+ <statfile>
+  <symbol>BAYES_SPAM</symbol>
+  <size>10M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_replymessage.spam</path>
+  <spam>yes</spam>
+  <label>reply message</label>
+ </statfile>
+ <statfile>
+  <symbol>BAYES_HAM</symbol>
+  <size>10M</size>
+  <path>@LOCALSTATES_PREFIX@/bayes_replymessage.ham</path>
+  <spam>no</spam>
+  <label>reply message</label>
+ </statfile>
+</classifier>
+-->
+
 <!-- End of classifiers section -->
 
 <!-- Modules section -->
index 7a073f4104cb7f2d25fe84c389c7f3c76558dae9..0b6cd05d3d2374f03775adf130ae88d34855519b 100644 (file)
@@ -1175,9 +1175,19 @@ rspamd_add_server (struct rspamd_client *client, const gchar *host, guint16 port
 {
        struct rspamd_server           *new;
        struct hostent                 *hent;
-       gint                                                    nlen;
+       gint                                                    nlen, i;
 
        g_assert (client != NULL);
+
+       /* Avoid duplicates */
+       for (i = 0; i < (gint)client->servers_num; i ++) {
+               new = &client->servers[i];
+               if (new->client_port == port && new->controller_port == controller_port && strcmp (host, new->host) == 0) {
+                       /* Duplicate */
+                       return TRUE;
+               }
+       }
+
        if (client->servers_num >= MAX_RSPAMD_SERVERS) {
                if (*err == NULL) {
                        *err = g_error_new (G_RSPAMD_ERROR, 1, "Maximum number of servers reached: %d", MAX_RSPAMD_SERVERS);
index cad963c4b8e384297b3e2a450d875d59d4efdfd4..a80bbe0ba32a7296ca08d9f0e697017904191cdc 100644 (file)
@@ -391,6 +391,7 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
        struct statfile                *st;
        stat_file_t                    *file;
        GList                          *cur;
+       gboolean                                                skip_labels;
 
        g_assert (pool != NULL);
        g_assert (ctx != NULL);
@@ -411,11 +412,14 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
                }
        }
 
-       cur = call_classifier_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L);
+       cur = call_classifier_pre_callbacks (ctx->cfg, task, TRUE, is_spam, L);
        if (cur) {
+               skip_labels = FALSE;
                memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_list_free, cur);
        }
        else {
+               /* Do not try to learn specific statfiles if pre callback returned nil */
+               skip_labels = TRUE;
                cur = ctx->cfg->statfiles;
        }
 
@@ -435,7 +439,7 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
        while (cur) {
                /* Select statfiles to learn */
                st = cur->data;
-               if (st->is_spam != is_spam) {
+               if (st->is_spam != is_spam || (skip_labels && st->label)) {
                        cur = g_list_next (cur);
                        continue;
                }
@@ -460,8 +464,6 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
                                        msg_err ("cannot open statfile %s after creation", st->path);
                                        return FALSE;
                                }
-                               cur = g_list_next (cur);
-                               continue;
                        }
                }
                data.file = file;
@@ -470,6 +472,7 @@ bayes_learn_spam (struct classifier_ctx* ctx, statfile_pool_t *pool,
                statfile_inc_revision (file);
                statfile_pool_unlock_file (pool, data.file);
                maybe_write_binlog (ctx->cfg, st, file, input);
+               msg_info ("increase revision for %s", st->path);
 
                cur = g_list_next (cur);
        }
index c987bc15f22c87cf0c58d7ed38420758c58509a7..47d4443174d60b7f24e8435748995fd8e88afeac 100644 (file)
@@ -488,11 +488,20 @@ process_stat_command (struct controller_session *session)
                                total = statfile_get_total_blocks (statfile);
                                statfile_get_revision (statfile, &rev, &ti);
                                if (total != (guint64)-1 && used != (guint64)-1) {
-                                       r += rspamd_snprintf (out_buf + r, sizeof (out_buf) - r, 
-                                                       "Statfile: %s (version %uL); length: %Hz; free blocks: %uL; total blocks: %uL; free: %.2f%%" CRLF,
-                                                       st->symbol, rev, st->size,
-                                                       (total - used), total,
-                                                       (double)((double)(total - used) / (double)total) * 100.);
+                                       if (st->label) {
+                                               r += rspamd_snprintf (out_buf + r, sizeof (out_buf) - r,
+                                                               "Statfile: %s <%s> (version %uL); length: %Hz; free blocks: %uL; total blocks: %uL; free: %.2f%%" CRLF,
+                                                               st->symbol, st->label, rev, st->size,
+                                                               (total - used), total,
+                                                               (double)((double)(total - used) / (double)total) * 100.);
+                                       }
+                                       else {
+                                               r += rspamd_snprintf (out_buf + r, sizeof (out_buf) - r,
+                                                               "Statfile: %s (version %uL); length: %Hz; free blocks: %uL; total blocks: %uL; free: %.2f%%" CRLF,
+                                                               st->symbol, rev, st->size,
+                                                               (total - used), total,
+                                                               (double)((double)(total - used) / (double)total) * 100.);
+                                       }
                                }
                        }
                        cur_st = g_list_next (cur_st);
@@ -1173,8 +1182,6 @@ fin_learn_task (void *arg)
 
        if (task->state != WRITING_REPLY) {
                task->state = WRITE_REPLY;
-               /* Process all statfiles */
-               process_statfiles (task);
        }
 
        /* Check if we have all events finished */
index be18cda0d05a93178311a5c79dd2bf9db3f2a16e..202d29af3afcffcc1308f71394226131a0ee02cb 100644 (file)
@@ -247,19 +247,18 @@ lua_classifier_get_statfiles (lua_State *L)
        struct classifier_config       *ccf = lua_check_classifier (L);
        GList                          *cur;
        struct statfile                *st, **pst;
+       gint                                                    i;
 
        if (ccf) {
                lua_newtable (L);
                cur = g_list_first (ccf->statfiles);
+               i = 1;
                while (cur) {
                        st = cur->data;
-                       /* t['statfile_name'] = statfile */
-                       lua_pushstring (L, st->symbol);
                        pst = lua_newuserdata (L, sizeof (struct statfile *));
                        lua_setclass (L, "rspamd{statfile}", -1);
                        *pst = st;
-
-                       lua_settable (L, -3);
+                       lua_rawseti (L, -2, i++);
 
                        cur = g_list_next (cur);
                }
@@ -388,7 +387,7 @@ lua_statfile_get_param (lua_State *L)
 
        if (st != NULL && param != NULL) {
                value = g_hash_table_lookup (st->opts, param);
-               if (param != NULL) {
+               if (value != NULL) {
                        lua_pushstring (L, value);
                        return 1;
                }
index 3c4674fc9305e5994584ac28c264c1c5a6cbe9fd..15c41550a307970acf657cf085b3d683d30b1522 100644 (file)
@@ -415,7 +415,7 @@ statfile_pool_close (statfile_pool_t * pool, stat_file_t * file, gboolean keep_s
 
        if (file->map) {
                msg_info ("syncing statfile %s", file->filename);
-               msync (file->map, file->len, MS_INVALIDATE | MS_SYNC);
+               msync (file->map, file->len, MS_ASYNC);
                munmap (file->map, file->len);
        }
        if (file->fd != -1) {
@@ -606,7 +606,7 @@ statfile_pool_set_block_common (statfile_pool_t * pool, stat_file_t * file, guin
        for (i = 0; i < CHAIN_LENGTH; i++) {
                if (i + blocknum >= file->cur_section.length) {
                        /* Need to expire some block in chain */
-                       msg_debug ("chain %u is full, starting expire", blocknum);
+                       msg_info ("chain %ud is full in statfile %s, starting expire", blocknum, file->filename);
                        break;
                }
                /* First try to find block in chain */
@@ -617,7 +617,7 @@ statfile_pool_set_block_common (statfile_pool_t * pool, stat_file_t * file, guin
                /* Check whether we have a free block in chain */
                if (block->hash1 == 0 && block->hash2 == 0) {
                        /* Write new block here */
-                       msg_debug ("found free block %u in chain %u, set h1=%u, h2=%u", i, blocknum, h1, h2);
+                       msg_debug ("found free block %ud in chain %ud, set h1=%ud, h2=%ud", i, blocknum, h1, h2);
                        block->hash1 = h1;
                        block->hash2 = h2;
                        block->value = value;
@@ -880,12 +880,20 @@ statfile_pool_invalidate_callback (gint fd, short what, void *ud)
 void
 statfile_pool_plan_invalidate (statfile_pool_t *pool, time_t seconds, time_t jitter)
 {
+       gboolean                        pending;
 
-       if (pool->invalidate_event == NULL || ! evtimer_pending (pool->invalidate_event, NULL)) {
 
-               if (pool->invalidate_event == NULL) {
-                       pool->invalidate_event = memory_pool_alloc (pool->pool, sizeof (struct event));
+       if (pool->invalidate_event != NULL) {
+               pending = evtimer_pending (pool->invalidate_event, NULL);
+               if (pending) {
+                       /* Replan event */
+                       pool->invalidate_tv.tv_sec = seconds + g_random_int_range (0, jitter);
+                       pool->invalidate_tv.tv_usec = 0;
+                       evtimer_add (pool->invalidate_event, &pool->invalidate_tv);
                }
+       }
+       else {
+               pool->invalidate_event = memory_pool_alloc (pool->pool, sizeof (struct event));
                pool->invalidate_tv.tv_sec = seconds + g_random_int_range (0, jitter);
                pool->invalidate_tv.tv_usec = 0;
                evtimer_set (pool->invalidate_event, statfile_pool_invalidate_callback, pool);