summaryrefslogtreecommitdiffstats
path: root/src/filter.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/filter.c')
-rw-r--r--src/filter.c83
1 files changed, 83 insertions, 0 deletions
diff --git a/src/filter.c b/src/filter.c
index fea91125f..66f233115 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -944,6 +944,89 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
return TRUE;
}
+gboolean
+learn_task_spam (struct classifier_config *cl, struct worker_task *task, gboolean is_spam, GError **err)
+{
+ GList *cur, *ex;
+ struct classifier_ctx *cls_ctx;
+ f_str_t c;
+ GTree *tokens = NULL;
+ struct mime_text_part *part;
+ gboolean is_utf = FALSE, is_twopart = FALSE;
+
+ cur = g_list_first (task->text_parts);
+ if (cur != NULL && cur->next != NULL && cur->next->next == NULL) {
+ is_twopart = TRUE;
+ }
+
+ /* Get tokens from each element */
+ while (cur) {
+ part = cur->data;
+ /* Skip empty parts */
+ if (part->is_empty) {
+ cur = g_list_next (cur);
+ continue;
+ }
+ c.begin = part->content->data;
+ c.len = part->content->len;
+ is_utf = part->is_utf;
+ ex = part->urls_offset;
+ if (is_twopart && cur->next == NULL) {
+ /* Compare part's content */
+ if (fuzzy_compare_parts (cur->data, cur->prev->data) >= COMMON_PART_FACTOR) {
+ msg_info ("message <%s> has two common text parts, ignore the last one", task->message_id);
+ break;
+ }
+ }
+ /* Get tokens */
+ if (!cl->tokenizer->tokenize_func (
+ cl->tokenizer, task->task_pool,
+ &c, &tokens, FALSE, is_utf, ex)) {
+ g_set_error (err, filter_error_quark(), 2, "Cannot tokenize message");
+ return FALSE;
+ }
+ cur = g_list_next (cur);
+ }
+
+ /* Handle messages without text */
+ if (tokens == NULL) {
+ g_set_error (err, filter_error_quark(), 3, "Cannot tokenize message, no text data");
+ msg_info ("learn failed for message <%s>, no tokens to extract", task->message_id);
+ return FALSE;
+ }
+
+ /* Take care of subject */
+ tokenize_subject (task, &tokens);
+
+ /* Init classifier */
+ cls_ctx = cl->classifier->init_func (
+ task->task_pool, cl);
+ /* Learn */
+ if (!cl->classifier->learn_spam_func (
+ cls_ctx, task->worker->srv->statfile_pool,
+ tokens, task, is_spam, err)) {
+ if (*err) {
+ msg_info ("learn failed for message <%s>, learn error: %s", task->message_id, (*err)->message);
+ return FALSE;
+ }
+ else {
+ g_set_error (err, filter_error_quark(), 4, "Learn failed, unknown learn classifier error");
+ msg_info ("learn failed for message <%s>, unknown learn error", task->message_id);
+ return FALSE;
+ }
+ }
+ /* Increase statistics */
+ task->worker->srv->stat->messages_learned++;
+
+ msg_info ("learn success for message <%s>",
+ task->message_id);
+ statfile_pool_plan_invalidate (task->worker->srv->statfile_pool,
+ DEFAULT_STATFILE_INVALIDATE_TIME,
+ DEFAULT_STATFILE_INVALIDATE_JITTER);
+
+ return TRUE;
+}
+
/*
* vi:ts=4
*/