Many changes to the advanced statistics and meta-classification logic.
Add an example of complex meta-classification.
-- Detect language of message and selects appropriate statfiles for it
+-- Common labels for specific statfiles
+local many_recipients_label = 'many recipients'
+local undisclosed_recipients_label = 'undisclosed recipients'
+local list_label = 'maillist'
+local long_subject_label = 'long subject'
+local different_reply_to_label = 'different reply to'
+local has_in_reply_label = 'reply message'
+
+-- Get specific statfiles set based on message rules.
+-- For every label that has statfiles configured in `classifier`, the
+-- corresponding property of the message is checked and, when it holds, all
+-- statfiles carrying that label are added to the result.
+-- @param classifier object queried through get_statfile_by_label()
+-- @param task object queried through get_recipients() and get_raw_header()
+-- @return table with the selected statfiles, or nil when fewer than two
+--         statfiles have been selected
+local function get_specific_statfiles(classifier, task)
+  local spec_st = {}
+
+  -- Append all statfiles found for a label to the resulting set
+  local function add_statfiles(statfiles)
+    for _, st in pairs(statfiles) do
+      table.insert(spec_st, st)
+    end
+  end
+
+  -- Return the value of the first instance of a raw header, or nil when the
+  -- header is absent or carries no value
+  local function first_header_value(name)
+    local hdr = task:get_raw_header(name)
+    if hdr and hdr[1] then
+      return hdr[1]['value']
+    end
+    return nil
+  end
+
+  -- More than 5 recipients
+  local st_many = classifier:get_statfile_by_label(many_recipients_label)
+  if st_many then
+    -- Keep the recipients list local: a bare assignment would leak a global
+    local rcpt = task:get_recipients()
+    if rcpt and table.maxn(rcpt) > 5 then
+      add_statfiles(st_many)
+    end
+  end
+  -- Undisclosed
+  local st_undisc = classifier:get_statfile_by_label(undisclosed_recipients_label)
+  if st_undisc then
+    local rcpt = task:get_recipients()
+    if rcpt and table.maxn(rcpt) == 0 then
+      add_statfiles(st_undisc)
+    end
+  end
+  -- Maillist
+  local st_maillist = classifier:get_statfile_by_label(list_label)
+  if st_maillist then
+    local unsub_header = task:get_raw_header('List-Unsubscribe')
+    if unsub_header and unsub_header[1] then
+      add_statfiles(st_maillist)
+    end
+  end
+  -- Long subject
+  local st_longsubj = classifier:get_statfile_by_label(long_subject_label)
+  if st_longsubj then
+    local subj = first_header_value('Subject')
+    if subj and string.len(subj) > 150 then
+      add_statfiles(st_longsubj)
+    end
+  end
+  -- Reply-To != To
+  local st_replyto = classifier:get_statfile_by_label(different_reply_to_label)
+  if st_replyto then
+    local to = first_header_value('To')
+    local reply_to = first_header_value('Reply-To')
+    -- Compare case-insensitively and only when both headers have a value
+    if to and reply_to and string.lower(to) ~= string.lower(reply_to) then
+      add_statfiles(st_replyto)
+    end
+  end
+  -- Has In-Reply-To header
+  local st_reply = classifier:get_statfile_by_label(has_in_reply_label)
+  if st_reply then
+    local inrep_header = task:get_raw_header('In-Reply-To')
+    if inrep_header and inrep_header[1] then
+      add_statfiles(st_reply)
+    end
+  end
+
+  -- A usable set needs at least two statfiles
+  if table.maxn(spec_st) > 1 then
+    return spec_st
+  end
+  return nil
+end
+
classifiers['bayes'] = function(classifier, task, is_learn, is_spam)
-- Subfunction for detection of message's language
local detect_language = function(task)
end
-- Main procedure
+ local selected = {}
+ local spec_st = get_specific_statfiles(classifier, task)
+ if spec_st then
+ if is_learn then
+ return spec_st
+ else
+ -- Merge tables
+ table.foreach(spec_st, function(i,v) table.insert(selected,v) end)
+ end
+ end
+ -- Detect statfile by language
language = detect_language(task)
if language then
-- Find statfiles with specified language
- local selected = {}
- for _,st in pairs(classifier:get_statfiles()) do
- local st_l = st:get_param('language')
- if st_l and st_l == language then
- -- Insert statfile with specified language
- table.insert(selected, st)
+ for _,st in ipairs(classifier:get_statfiles()) do
+ -- Skip labeled statfiles
+ if not st:get_label() then
+ local st_l = st:get_param('language')
+ if st_l and st_l == language then
+ -- Insert statfile with specified language
+ table.insert(selected, st)
+ end
end
end
if table.maxn(selected) > 1 then
return selected
end
- else
- -- Language not detected
- local selected = {}
- for _,st in ipairs(classifier:get_statfiles()) do
+ end
+
+ -- Language not detected or specific language statfiles have not been found
+ for _,st in ipairs(classifier:get_statfiles()) do
+ -- Skip labeled statfiles
+ if not st:get_label() then
local st_l = st:get_param('language')
-- Insert only statfiles without language
if not st_l then
table.insert(selected, st)
end
end
- if table.maxn(selected) > 1 then
- return selected
- end
end
-
+ if table.maxn(selected) > 1 then
+ return selected
+ end
+
return nil
end
</statfile>
</classifier>
+<!-- Advanced meta-classification statistics -->
+<!--
+<classifier type="bayes">
+ <tokenizer>osb-text</tokenizer>
+ <metric>default</metric>
+ <min_tokens>6</min_tokens>
+ <max_tokens>5000</max_tokens>
+ <statfile>
+ <symbol>BAYES_HAM</symbol>
+ <size>50M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes.ham</path>
+ <spam>no</spam>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_SPAM</symbol>
+ <size>50M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes.spam</path>
+ <spam>yes</spam>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_HAM_RU</symbol>
+ <size>50M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_ru.ham</path>
+ <language>ru</language>
+ <spam>no</spam>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_SPAM_RU</symbol>
+ <size>50M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_ru.spam</path>
+ <language>ru</language>
+ <spam>yes</spam>
+ </statfile>
+
+ <statfile>
+ <symbol>BAYES_SPAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_manyrcpt.spam</path>
+ <spam>yes</spam>
+ <label>many recipients</label>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_HAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_manyrcpt.ham</path>
+ <spam>no</spam>
+ <label>many recipients</label>
+ </statfile>
+
+ <statfile>
+ <symbol>BAYES_SPAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_undisclosedrcpt.spam</path>
+ <spam>yes</spam>
+ <label>undisclosed recipients</label>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_HAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_undisclosedrcpt.ham</path>
+ <spam>no</spam>
+ <label>undisclosed recipients</label>
+ </statfile>
+
+ <statfile>
+ <symbol>BAYES_SPAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_maillist.spam</path>
+ <spam>yes</spam>
+ <label>maillist</label>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_HAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_maillist.ham</path>
+ <spam>no</spam>
+ <label>maillist</label>
+ </statfile>
+
+ <statfile>
+ <symbol>BAYES_SPAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_longsubject.spam</path>
+ <spam>yes</spam>
+ <label>long subject</label>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_HAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_longsubject.ham</path>
+ <spam>no</spam>
+ <label>long subject</label>
+ </statfile>
+
+ <statfile>
+ <symbol>BAYES_SPAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_replyto.spam</path>
+ <spam>yes</spam>
+ <label>different reply to</label>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_HAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_replyto.ham</path>
+ <spam>no</spam>
+ <label>different reply to</label>
+ </statfile>
+
+ <statfile>
+ <symbol>BAYES_SPAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_replymessage.spam</path>
+ <spam>yes</spam>
+ <label>reply message</label>
+ </statfile>
+ <statfile>
+ <symbol>BAYES_HAM</symbol>
+ <size>10M</size>
+ <path>@LOCALSTATES_PREFIX@/bayes_replymessage.ham</path>
+ <spam>no</spam>
+ <label>reply message</label>
+ </statfile>
+</classifier>
+-->
+
<!-- End of classifiers section -->
<!-- Modules section -->
{
struct rspamd_server *new;
struct hostent *hent;
- gint nlen;
+ gint nlen, i;
g_assert (client != NULL);
+
+ /* Avoid duplicates */
+ for (i = 0; i < (gint)client->servers_num; i ++) {
+ new = &client->servers[i];
+ if (new->client_port == port && new->controller_port == controller_port && strcmp (host, new->host) == 0) {
+ /* Duplicate */
+ return TRUE;
+ }
+ }
+
if (client->servers_num >= MAX_RSPAMD_SERVERS) {
if (*err == NULL) {
*err = g_error_new (G_RSPAMD_ERROR, 1, "Maximum number of servers reached: %d", MAX_RSPAMD_SERVERS);
struct statfile *st;
stat_file_t *file;
GList *cur;
+ gboolean skip_labels;
g_assert (pool != NULL);
g_assert (ctx != NULL);
}
}
- cur = call_classifier_pre_callbacks (ctx->cfg, task, FALSE, FALSE, L);
+ cur = call_classifier_pre_callbacks (ctx->cfg, task, TRUE, is_spam, L);
if (cur) {
+ skip_labels = FALSE;
memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_list_free, cur);
}
else {
+ /* Do not try to learn specific statfiles if pre callback returned nil */
+ skip_labels = TRUE;
cur = ctx->cfg->statfiles;
}
while (cur) {
/* Select statfiles to learn */
st = cur->data;
- if (st->is_spam != is_spam) {
+ if (st->is_spam != is_spam || (skip_labels && st->label)) {
cur = g_list_next (cur);
continue;
}
msg_err ("cannot open statfile %s after creation", st->path);
return FALSE;
}
- cur = g_list_next (cur);
- continue;
}
}
data.file = file;
statfile_inc_revision (file);
statfile_pool_unlock_file (pool, data.file);
maybe_write_binlog (ctx->cfg, st, file, input);
+ msg_info ("increase revision for %s", st->path);
cur = g_list_next (cur);
}
total = statfile_get_total_blocks (statfile);
statfile_get_revision (statfile, &rev, &ti);
if (total != (guint64)-1 && used != (guint64)-1) {
- r += rspamd_snprintf (out_buf + r, sizeof (out_buf) - r,
- "Statfile: %s (version %uL); length: %Hz; free blocks: %uL; total blocks: %uL; free: %.2f%%" CRLF,
- st->symbol, rev, st->size,
- (total - used), total,
- (double)((double)(total - used) / (double)total) * 100.);
+ if (st->label) {
+ r += rspamd_snprintf (out_buf + r, sizeof (out_buf) - r,
+ "Statfile: %s <%s> (version %uL); length: %Hz; free blocks: %uL; total blocks: %uL; free: %.2f%%" CRLF,
+ st->symbol, st->label, rev, st->size,
+ (total - used), total,
+ (double)((double)(total - used) / (double)total) * 100.);
+ }
+ else {
+ r += rspamd_snprintf (out_buf + r, sizeof (out_buf) - r,
+ "Statfile: %s (version %uL); length: %Hz; free blocks: %uL; total blocks: %uL; free: %.2f%%" CRLF,
+ st->symbol, rev, st->size,
+ (total - used), total,
+ (double)((double)(total - used) / (double)total) * 100.);
+ }
}
}
cur_st = g_list_next (cur_st);
if (task->state != WRITING_REPLY) {
task->state = WRITE_REPLY;
- /* Process all statfiles */
- process_statfiles (task);
}
/* Check if we have all events finished */
struct classifier_config *ccf = lua_check_classifier (L);
GList *cur;
struct statfile *st, **pst;
+ gint i;
if (ccf) {
lua_newtable (L);
cur = g_list_first (ccf->statfiles);
+ i = 1;
while (cur) {
st = cur->data;
- /* t['statfile_name'] = statfile */
- lua_pushstring (L, st->symbol);
pst = lua_newuserdata (L, sizeof (struct statfile *));
lua_setclass (L, "rspamd{statfile}", -1);
*pst = st;
-
- lua_settable (L, -3);
+ lua_rawseti (L, -2, i++);
cur = g_list_next (cur);
}
if (st != NULL && param != NULL) {
value = g_hash_table_lookup (st->opts, param);
- if (param != NULL) {
+ if (value != NULL) {
lua_pushstring (L, value);
return 1;
}
if (file->map) {
msg_info ("syncing statfile %s", file->filename);
- msync (file->map, file->len, MS_INVALIDATE | MS_SYNC);
+ msync (file->map, file->len, MS_ASYNC);
munmap (file->map, file->len);
}
if (file->fd != -1) {
for (i = 0; i < CHAIN_LENGTH; i++) {
if (i + blocknum >= file->cur_section.length) {
/* Need to expire some block in chain */
- msg_debug ("chain %u is full, starting expire", blocknum);
+ msg_info ("chain %ud is full in statfile %s, starting expire", blocknum, file->filename);
break;
}
/* First try to find block in chain */
/* Check whether we have a free block in chain */
if (block->hash1 == 0 && block->hash2 == 0) {
/* Write new block here */
- msg_debug ("found free block %u in chain %u, set h1=%u, h2=%u", i, blocknum, h1, h2);
+ msg_debug ("found free block %ud in chain %ud, set h1=%ud, h2=%ud", i, blocknum, h1, h2);
block->hash1 = h1;
block->hash2 = h2;
block->value = value;
void
statfile_pool_plan_invalidate (statfile_pool_t *pool, time_t seconds, time_t jitter)
{
+ gboolean pending;
- if (pool->invalidate_event == NULL || ! evtimer_pending (pool->invalidate_event, NULL)) {
- if (pool->invalidate_event == NULL) {
- pool->invalidate_event = memory_pool_alloc (pool->pool, sizeof (struct event));
+ if (pool->invalidate_event != NULL) {
+ pending = evtimer_pending (pool->invalidate_event, NULL);
+ if (pending) {
+ /* Replan event */
+ pool->invalidate_tv.tv_sec = seconds + g_random_int_range (0, jitter);
+ pool->invalidate_tv.tv_usec = 0;
+ evtimer_add (pool->invalidate_event, &pool->invalidate_tv);
}
+ }
+ else {
+ pool->invalidate_event = memory_pool_alloc (pool->pool, sizeof (struct event));
pool->invalidate_tv.tv_sec = seconds + g_random_int_range (0, jitter);
pool->invalidate_tv.tv_usec = 0;
evtimer_set (pool->invalidate_event, statfile_pool_invalidate_callback, pool);