aboutsummaryrefslogtreecommitdiffstats
path: root/conf
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2012-10-04 22:14:10 +0400
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2012-10-04 22:14:10 +0400
commit3789849b7b2e617d0a287fe77490b6643f3a6b74 (patch)
tree8415305aee39a3aad6adbccbc1941a62f3a41cf8 /conf
parent14e1129068d55bc8de0618832d4f7d33bb1b0f06 (diff)
downloadrspamd-3789849b7b2e617d0a287fe77490b6643f3a6b74.tar.gz
rspamd-3789849b7b2e617d0a287fe77490b6643f3a6b74.zip
* Add meta-classification example.
Many changes to advanced statistic and meta-classification logic. Add example of complex meta-classification.
Diffstat (limited to 'conf')
-rw-r--r--conf/lua/rspamd.classifiers.lua114
-rw-r--r--conf/rspamd-basic.xml.in126
2 files changed, 226 insertions, 14 deletions
diff --git a/conf/lua/rspamd.classifiers.lua b/conf/lua/rspamd.classifiers.lua
index e158a29a6..de1e23506 100644
--- a/conf/lua/rspamd.classifiers.lua
+++ b/conf/lua/rspamd.classifiers.lua
@@ -1,5 +1,76 @@
-- Detect language of message and selects appropriate statfiles for it
+-- Common labels for specific statfiles
+local many_recipients_label = 'many recipients'
+local undisclosed_recipients_label = 'undisclosed recipients'
+local list_label = 'maillist'
+local long_subject_label = 'long subject'
+local different_reply_to_label = 'different reply to'
+local has_in_reply_label = 'reply message'
+
+-- Get specific statfiles set based on message rules
+local function get_specific_statfiles(classifier, task)
+  -- Collect the statfiles whose label matches a property of the message.
+  --   classifier: queried through get_statfile_by_label(label)
+  --   task: queried through get_recipients() and get_raw_header(name)
+  -- Returns the collected list when it holds more than one statfile,
+  -- nil otherwise.
+  local spec_st = {}
+
+  -- Append every value stored in `src` to the end of spec_st
+  local function append_all(src)
+    for _, v in pairs(src) do
+      table.insert(spec_st, v)
+    end
+  end
+
+  -- Return the 'value' field of the first entry of a raw header, or nil
+  -- when the header is absent or carries no value
+  local function first_value(name)
+    local hdr = task:get_raw_header(name)
+    return hdr and hdr[1] and hdr[1]['value']
+  end
+
+  -- More than 5 recipients
+  local st_many = classifier:get_statfile_by_label(many_recipients_label)
+  if st_many then
+    -- `local` keeps the recipients list out of the global environment
+    local rcpt = task:get_recipients()
+    if rcpt and #rcpt > 5 then
+      append_all(st_many)
+    end
+  end
+  -- Undisclosed recipients: the recipients list exists but is empty
+  local st_undisc = classifier:get_statfile_by_label(undisclosed_recipients_label)
+  if st_undisc then
+    local rcpt = task:get_recipients()
+    if rcpt and #rcpt == 0 then
+      append_all(st_undisc)
+    end
+  end
+  -- Maillist: the message has a List-Unsubscribe header
+  local st_maillist = classifier:get_statfile_by_label(list_label)
+  if st_maillist then
+    local unsub_header = task:get_raw_header('List-Unsubscribe')
+    if unsub_header and unsub_header[1] then
+      append_all(st_maillist)
+    end
+  end
+  -- Long subject: raw Subject value longer than 150 bytes
+  local st_longsubj = classifier:get_statfile_by_label(long_subject_label)
+  if st_longsubj then
+    local subj = first_value('Subject')
+    if subj and #subj > 150 then
+      append_all(st_longsubj)
+    end
+  end
+  -- Reply-To != To (case-insensitive comparison of the raw values)
+  local st_replyto = classifier:get_statfile_by_label(different_reply_to_label)
+  if st_replyto then
+    local to = first_value('To')
+    local reply_to = first_value('Reply-To')
+    if to and reply_to and string.lower(to) ~= string.lower(reply_to) then
+      append_all(st_replyto)
+    end
+  end
+  -- Has In-Reply-To header
+  local st_reply = classifier:get_statfile_by_label(has_in_reply_label)
+  if st_reply then
+    local inrep_header = task:get_raw_header('In-Reply-To')
+    if inrep_header and inrep_header[1] then
+      append_all(st_reply)
+    end
+  end
+
+  -- Fewer than two collected statfiles is reported as "no specific set"
+  if #spec_st > 1 then
+    return spec_st
+  end
+  return nil
+end
+
classifiers['bayes'] = function(classifier, task, is_learn, is_spam)
-- Subfunction for detection of message's language
local detect_language = function(task)
@@ -14,35 +85,50 @@ classifiers['bayes'] = function(classifier, task, is_learn, is_spam)
end
-- Main procedure
+ local selected = {}
+ local spec_st = get_specific_statfiles(classifier, task)
+ if spec_st then
+ if is_learn then
+ return spec_st
+ else
+ -- Merge tables
+ table.foreach(spec_st, function(i,v) table.insert(selected,v) end)
+ end
+ end
+ -- Detect statfile by language
language = detect_language(task)
if language then
-- Find statfiles with specified language
- local selected = {}
- for _,st in pairs(classifier:get_statfiles()) do
- local st_l = st:get_param('language')
- if st_l and st_l == language then
- -- Insert statfile with specified language
- table.insert(selected, st)
+ for _,st in ipairs(classifier:get_statfiles()) do
+ -- Skip labeled statfiles
+ if not st:get_label() then
+ local st_l = st:get_param('language')
+ if st_l and st_l == language then
+ -- Insert statfile with specified language
+ table.insert(selected, st)
+ end
end
end
if table.maxn(selected) > 1 then
return selected
end
- else
- -- Language not detected
- local selected = {}
- for _,st in ipairs(classifier:get_statfiles()) do
+ end
+
+ -- Language not detected or specific language statfiles have not been found
+ for _,st in ipairs(classifier:get_statfiles()) do
+ -- Skip labeled statfiles
+ if not st:get_label() then
local st_l = st:get_param('language')
-- Insert only statfiles without language
if not st_l then
table.insert(selected, st)
end
end
- if table.maxn(selected) > 1 then
- return selected
- end
end
-
+ if table.maxn(selected) > 1 then
+ return selected
+ end
+
return nil
end
diff --git a/conf/rspamd-basic.xml.in b/conf/rspamd-basic.xml.in
index 06bd80c03..fbeee898a 100644
--- a/conf/rspamd-basic.xml.in
+++ b/conf/rspamd-basic.xml.in
@@ -603,6 +603,132 @@
</statfile>
</classifier>
+<!-- Advanced meta-classification statistic -->
+<!--
+<classifier type="bayes">
+ <tokenizer>osb-text</tokenizer>
+ <metric>default</metric>
+ <min_tokens>6</min_tokens>
+ <max_tokens>5000</max_tokens>
+ <statfile>
+ <symbol>BAYES_HAM</symbol>
+ <size>50M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes.ham</path>
+ <spam>no</spam>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_SPAM</symbol>
+ <size>50M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes.spam</path>
+ <spam>yes</spam>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_HAM_RU</symbol>
+ <size>50M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_ru.ham</path>
+ <language>ru</language>
+ <spam>no</spam>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_SPAM_RU</symbol>
+ <size>50M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_ru.spam</path>
+ <language>ru</language>
+ <spam>yes</spam>
+ </statfile>
+
+ <statfile>
+ <symbol>BAYES_SPAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_manyrcpt.spam</path>
+ <spam>yes</spam>
+ <label>many recipients</label>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_HAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_manyrcpt.ham</path>
+ <spam>no</spam>
+ <label>many recipients</label>
+ </statfile>
+
+ <statfile>
+ <symbol>BAYES_SPAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_undisclosedrcpt.spam</path>
+ <spam>yes</spam>
+ <label>undisclosed recipients</label>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_HAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_undisclosedrcpt.ham</path>
+ <spam>no</spam>
+ <label>undisclosed recipients</label>
+ </statfile>
+
+ <statfile>
+ <symbol>BAYES_SPAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_maillist.spam</path>
+ <spam>yes</spam>
+ <label>maillist</label>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_HAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_maillist.ham</path>
+ <spam>no</spam>
+ <label>maillist</label>
+ </statfile>
+
+ <statfile>
+ <symbol>BAYES_SPAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_longsubject.spam</path>
+ <spam>yes</spam>
+ <label>long subject</label>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_HAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_longsubject.ham</path>
+ <spam>no</spam>
+ <label>long subject</label>
+ </statfile>
+
+ <statfile>
+ <symbol>BAYES_SPAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_replyto.spam</path>
+ <spam>yes</spam>
+ <label>different reply to</label>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_HAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_replyto.ham</path>
+ <spam>no</spam>
+ <label>different reply to</label>
+ </statfile>
+
+ <statfile>
+ <symbol>BAYES_SPAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_replymessage.spam</path>
+ <spam>yes</spam>
+ <label>reply message</label>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_HAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_replymessage.ham</path>
+ <spam>no</spam>
+ <label>reply message</label>
+ </statfile>
+</classifier>
+-->
+
<!-- End of classifiers section -->
<!-- Modules section -->