diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2012-10-04 22:14:10 +0400 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2012-10-04 22:14:10 +0400 |
commit | 3789849b7b2e617d0a287fe77490b6643f3a6b74 (patch) | |
tree | 8415305aee39a3aad6adbccbc1941a62f3a41cf8 /conf | |
parent | 14e1129068d55bc8de0618832d4f7d33bb1b0f06 (diff) | |
download | rspamd-3789849b7b2e617d0a287fe77490b6643f3a6b74.tar.gz rspamd-3789849b7b2e617d0a287fe77490b6643f3a6b74.zip |
* Add meta-classification example.
Many changes to advanced statistics and meta-classification logic.
Add example of complex meta-classification.
Diffstat (limited to 'conf')
-rw-r--r-- | conf/lua/rspamd.classifiers.lua | 114 | ||||
-rw-r--r-- | conf/rspamd-basic.xml.in | 126 |
2 files changed, 226 insertions, 14 deletions
diff --git a/conf/lua/rspamd.classifiers.lua b/conf/lua/rspamd.classifiers.lua index e158a29a6..de1e23506 100644 --- a/conf/lua/rspamd.classifiers.lua +++ b/conf/lua/rspamd.classifiers.lua @@ -1,5 +1,76 @@ -- Detect language of message and selects appropriate statfiles for it +-- Common labels for specific statfiles +local many_recipients_label = 'many recipients' +local undisclosed_recipients_label = 'undisclosed recipients' +local list_label = 'maillist' +local long_subject_label = 'long subject' +local different_reply_to_label = 'different reply to' +local has_in_reply_label = 'reply message' + +-- Get specific statfiles set based on message rules +local function get_specific_statfiles(classifier, task) + local spec_st = {} + -- More 5 recipients + local st_many = classifier:get_statfile_by_label(many_recipients_label) + if st_many then + rcpt = task:get_recipients() + if rcpt and table.maxn(rcpt) > 5 then + print(table.maxn(rcpt)) + table.foreach(st_many, function(i,v) table.insert(spec_st,v) end) + end + end + -- Undisclosed + local st_undisc = classifier:get_statfile_by_label(undisclosed_recipients_label) + if st_undisc then + rcpt = task:get_recipients() + if rcpt and table.maxn(rcpt) == 0 then + table.foreach(st_undisc, function(i,v) table.insert(spec_st,v) end) + end + end + -- Maillist + local st_maillist = classifier:get_statfile_by_label(list_label) + if st_maillist then + local unsub_header = task:get_raw_header('List-Unsubscribe') + if unsub_header and unsub_header[1] then + table.foreach(st_maillist, function(i,v) table.insert(spec_st,v) end) + end + end + -- Long subject + local st_longsubj = classifier:get_statfile_by_label(long_subject_label) + if st_longsubj then + local subj = task:get_raw_header('Subject') + if subj and subj[1] and string.len(subj[1]['value']) > 150 then + table.foreach(st_longsubj, function(i,v) table.insert(spec_st,v) end) + end + end + -- Reply-To != To + local st_replyto = 
classifier:get_statfile_by_label(different_reply_to_label) + if st_replyto then + local to = task:get_raw_header('To') + local reply_to = task:get_raw_header('Reply-To') + if to and to[1] and reply_to and reply_to[1] then + if string.lower(to[1]['value']) ~= string.lower(reply_to[1]['value']) then + table.foreach(st_replyto, function(i,v) table.insert(spec_st,v) end) + end + end + end + -- Has In-Reply-To header + local st_reply = classifier:get_statfile_by_label(has_in_reply_label) + if st_reply then + local inrep_header = task:get_raw_header('In-Reply-To') + if inrep_header and inrep_header[1] then + table.foreach(st_reply, function(i,v) table.insert(spec_st,v) end) + end + end + + if table.maxn(spec_st) > 1 then + return spec_st + else + return nil + end +end + classifiers['bayes'] = function(classifier, task, is_learn, is_spam) -- Subfunction for detection of message's language local detect_language = function(task) @@ -14,35 +85,50 @@ classifiers['bayes'] = function(classifier, task, is_learn, is_spam) end -- Main procedure + local selected = {} + local spec_st = get_specific_statfiles(classifier, task) + if spec_st then + if is_learn then + return spec_st + else + -- Merge tables + table.foreach(spec_st, function(i,v) table.insert(selected,v) end) + end + end + -- Detect statfile by language language = detect_language(task) if language then -- Find statfiles with specified language - local selected = {} - for _,st in pairs(classifier:get_statfiles()) do - local st_l = st:get_param('language') - if st_l and st_l == language then - -- Insert statfile with specified language - table.insert(selected, st) + for _,st in ipairs(classifier:get_statfiles()) do + -- Skip labeled statfiles + if not st:get_label() then + local st_l = st:get_param('language') + if st_l and st_l == language then + -- Insert statfile with specified language + table.insert(selected, st) + end end end if table.maxn(selected) > 1 then return selected end - else - -- Language not detected - 
local selected = {} - for _,st in ipairs(classifier:get_statfiles()) do + end + + -- Language not detected or specific language statfiles have not been found + for _,st in ipairs(classifier:get_statfiles()) do + -- Skip labeled statfiles + if not st:get_label() then local st_l = st:get_param('language') -- Insert only statfiles without language if not st_l then table.insert(selected, st) end end - if table.maxn(selected) > 1 then - return selected - end end - + if table.maxn(selected) > 1 then + return selected + end + return nil end diff --git a/conf/rspamd-basic.xml.in b/conf/rspamd-basic.xml.in index 06bd80c03..fbeee898a 100644 --- a/conf/rspamd-basic.xml.in +++ b/conf/rspamd-basic.xml.in @@ -603,6 +603,132 @@ </statfile> </classifier> +<!-- Advanced meta-classification statistic --> +<!-- +<classifier type="bayes"> + <tokenizer>osb-text</tokenizer> + <metric>default</metric> + <min_tokens>6</min_tokens> + <max_tokens>5000</max_tokens> + <statfile> + <symbol>BAYES_HAM</symbol> + <size>50M</size> + <path>@LOCALSTATES_PREFIX@/bayes.ham</path> + <spam>no</spam> + </statfile> + <statfile> + <symbol>BAYES_SPAM</symbol> + <size>50M</size> + <path>@LOCALSTATES_PREFIX@/bayes.spam</path> + <spam>yes</spam> + </statfile> + <statfile> + <symbol>BAYES_HAM_RU</symbol> + <size>50M</size> + <path>@LOCALSTATES_PREFIX@/bayes_ru.ham</path> + <language>ru</language> + <spam>no</spam> + </statfile> + <statfile> + <symbol>BAYES_SPAM_RU</symbol> + <size>50M</size> + <path>@LOCALSTATES_PREFIX@/bayes_ru.spam</path> + <language>ru</language> + <spam>yes</spam> + </statfile> + + <statfile> + <symbol>BAYES_SPAM</symbol> + <size>10M</size> + <path>@LOCALSTATES_PREFIX@/bayes_manyrcpt.spam</path> + <spam>yes</spam> + <label>many recipients</label> + </statfile> + <statfile> + <symbol>BAYES_HAM</symbol> + <size>10M</size> + <path>@LOCALSTATES_PREFIX@/bayes_manyrcpt.ham</path> + <spam>no</spam> + <label>many recipients</label> + </statfile> + + <statfile> + <symbol>BAYES_SPAM</symbol> + 
<size>10M</size> + <path>@LOCALSTATES_PREFIX@/bayes_undisclosedrcpt.spam</path> + <spam>yes</spam> + <label>undisclosed recipients</label> + </statfile> + <statfile> + <symbol>BAYES_HAM</symbol> + <size>10M</size> + <path>@LOCALSTATES_PREFIX@/bayes_undisclosedrcpt.ham</path> + <spam>no</spam> + <label>undisclosed recipients</label> + </statfile> + + <statfile> + <symbol>BAYES_SPAM</symbol> + <size>10M</size> + <path>@LOCALSTATES_PREFIX@/bayes_maillist.spam</path> + <spam>yes</spam> + <label>maillist</label> + </statfile> + <statfile> + <symbol>BAYES_HAM</symbol> + <size>10M</size> + <path>@LOCALSTATES_PREFIX@/bayes_maillist.ham</path> + <spam>no</spam> + <label>maillist</label> + </statfile> + + <statfile> + <symbol>BAYES_SPAM</symbol> + <size>10M</size> + <path>@LOCALSTATES_PREFIX@/bayes_longsubject.spam</path> + <spam>yes</spam> + <label>long subject</label> + </statfile> + <statfile> + <symbol>BAYES_HAM</symbol> + <size>10M</size> + <path>@LOCALSTATES_PREFIX@/bayes_longsubject.ham</path> + <spam>no</spam> + <label>long subject</label> + </statfile> + + <statfile> + <symbol>BAYES_SPAM</symbol> + <size>10M</size> + <path>@LOCALSTATES_PREFIX@/bayes_replyto.spam</path> + <spam>yes</spam> + <label>different reply to</label> + </statfile> + <statfile> + <symbol>BAYES_HAM</symbol> + <size>10M</size> + <path>@LOCALSTATES_PREFIX@/bayes_replyto.ham</path> + <spam>no</spam> + <label>different reply to</label> + </statfile> + + <statfile> + <symbol>BAYES_SPAM</symbol> + <size>10M</size> + <path>@LOCALSTATES_PREFIX@/bayes_replymessage.spam</path> + <spam>yes</spam> + <label>reply message</label> + </statfile> + <statfile> + <symbol>BAYES_HAM</symbol> + <size>10M</size> + <path>@LOCALSTATES_PREFIX@/bayes_replymessage.ham</path> + <spam>no</spam> + <label>reply message</label> + </statfile> +</classifier> +--> + <!-- End of classifiers section --> <!-- Modules section --> |