diff options
Diffstat (limited to 'src/libstat/stat_process.c')
-rw-r--r-- | src/libstat/stat_process.c | 366 |
1 files changed, 76 insertions, 290 deletions
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index ca51d7b02..d097e12e0 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -32,273 +32,85 @@ static const gdouble similarity_treshold = 80.0; static void -rspamd_stat_tokenize_header (struct rspamd_task *task, - const gchar *name, const gchar *prefix, GArray *ar) -{ - struct rspamd_mime_header *cur; - GPtrArray *hdrs; - guint i; - rspamd_stat_token_t str; - - hdrs = g_hash_table_lookup (task->raw_headers, name); - str.flags = RSPAMD_STAT_TOKEN_FLAG_META; - - if (hdrs != NULL) { - - PTR_ARRAY_FOREACH (hdrs, i, cur) { - if (cur->name != NULL) { - str.begin = cur->name; - str.len = strlen (cur->name); - g_array_append_val (ar, str); - } - if (cur->decoded != NULL) { - str.begin = cur->decoded; - str.len = strlen (cur->decoded); - g_array_append_val (ar, str); - } - else if (cur->value != NULL) { - str.begin = cur->value; - str.len = strlen (cur->value); - g_array_append_val (ar, str); - } - } - - msg_debug_task ("added stat tokens for header '%s'", name); - } -} - -static void rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx, struct rspamd_task *task) { - struct rspamd_image *img; - struct rspamd_mime_part *part; - struct rspamd_mime_text_part *tp; - GList *cur; GArray *ar; rspamd_stat_token_t elt; guint i; - gchar tmpbuf[128]; lua_State *L = task->cfg->lua_state; - const gchar *headers_hash; - struct rspamd_mime_header *hdr; ar = g_array_sized_new (FALSE, FALSE, sizeof (elt), 16); + memset (&elt, 0, sizeof (elt)); elt.flags = RSPAMD_STAT_TOKEN_FLAG_META; - /* Insert images */ - for (i = 0; i < task->parts->len; i ++) { - part = g_ptr_array_index (task->parts, i); - - if ((part->flags & RSPAMD_MIME_PART_IMAGE) && part->specific.img) { - img = part->specific.img; - - /* If an image has a linked HTML part, then we push its details to the stat */ - if (img->html_image) { - elt.begin = (gchar *)"image"; - elt.len = 5; - g_array_append_val (ar, elt); - elt.begin = (gchar *)&img->html_image->height; - elt.len = sizeof (img->html_image->height); - g_array_append_val (ar, elt); - elt.begin = (gchar *)&img->html_image->width; - elt.len = sizeof (img->html_image->width); - g_array_append_val (ar, elt); - elt.begin = (gchar *)&img->type; - elt.len = sizeof (img->type); - g_array_append_val (ar, elt); - - if (img->filename) { - elt.begin = (gchar *)img->filename; - elt.len = strlen (elt.begin); - g_array_append_val (ar, elt); - } + if (st_ctx->lua_stat_tokens_ref != -1) { + gint err_idx, ret; + GString *tb; + struct rspamd_task **ptask; - msg_debug_task ("added stat tokens for image '%s'", img->html_image->src); - } - } - else if (part->cd && part->cd->filename.len > 0) { - elt.begin = (gchar *)part->cd->filename.begin; - elt.len = part->cd->filename.len; - g_array_append_val (ar, elt); - } - } + lua_pushcfunction (L, &rspamd_lua_traceback); + err_idx = lua_gettop (L); + lua_rawgeti (L, LUA_REGISTRYINDEX, st_ctx->lua_stat_tokens_ref); - /* Process mime parts */ - for (i = 0; i < task->parts->len; i ++) { - part = g_ptr_array_index (task->parts, i); + ptask = lua_newuserdata (L, sizeof (*ptask)); + *ptask = task; + rspamd_lua_setclass (L, "rspamd{task}", -1); - if (IS_CT_MULTIPART (part->ct)) { - elt.begin = (gchar *)part->ct->boundary.begin; - elt.len = part->ct->boundary.len; + if ((ret = lua_pcall (L, 1, 1, err_idx)) != 0) { + tb = lua_touserdata (L, -1); + msg_err_task ("call to stat_tokens lua " + "script failed (%d): %v", ret, tb); - if (elt.len) { - msg_debug_task ("added stat tokens for mime boundary '%*s'", - (gint)elt.len, elt.begin); - g_array_append_val (ar, elt); + if (tb) { + g_string_free (tb, TRUE); } - - if (part->parsed_data.len > 1) { - rspamd_snprintf (tmpbuf, sizeof (tmpbuf), "mime%d:%dlog", - i, (gint)log2 (part->parsed_data.len)); - elt.begin = rspamd_mempool_strdup (task->task_pool, tmpbuf); - elt.len = strlen (elt.begin); - g_array_append_val (ar, elt); - } - } - } - - /* Process text parts metadata */ - for (i = 0; i < task->text_parts->len; i ++) { - tp = g_ptr_array_index (task->text_parts, i); - - if (tp->language != NULL && tp->language[0] != '\0') { - elt.begin = (gchar *)tp->language; - elt.len = strlen (elt.begin); - msg_debug_task ("added stat tokens for part language '%s'", elt.begin); - g_array_append_val (ar, elt); - } - if (tp->real_charset != NULL) { - elt.begin = (gchar *)tp->real_charset; - elt.len = strlen (elt.begin); - msg_debug_task ("added stat tokens for part charset '%s'", elt.begin); - g_array_append_val (ar, elt); - } - } - - cur = g_list_first (task->cfg->classify_headers); - - while (cur) { - rspamd_stat_tokenize_header (task, cur->data, "UA:", ar); - - cur = g_list_next (cur); - } - - /* Use headers order */ - headers_hash = rspamd_mempool_get_variable (task->task_pool, - RSPAMD_MEMPOOL_HEADERS_HASH); - - if (headers_hash) { - elt.begin = (gchar *)headers_hash; - elt.len = 16; - g_array_append_val (ar, elt); - } - - /* Use more precise headers order */ - cur = g_list_first (task->headers_order->head); - while (cur) { - hdr = cur->data; - - if (hdr->name && hdr->type != RSPAMD_HEADER_RECEIVED) { - elt.begin = hdr->name; - elt.len = strlen (hdr->name); - g_array_append_val (ar, elt); } + else { + if (lua_type (L, -1) != LUA_TTABLE) { + msg_err_task ("stat_tokens invocation must return " + "table and not %s", + lua_typename (L, lua_type (L, -1))); + } + else { + guint vlen; + rspamd_ftok_t tok; - cur = g_list_next (cur); - } - - /* Use metatokens plugin from Lua */ - lua_getglobal (L, "rspamd_plugins"); - - if (lua_type (L, -1) == LUA_TTABLE) { - lua_pushstring (L, "stat_metatokens"); - lua_gettable (L, -2); - - if (lua_type (L, -1) == LUA_TTABLE) { - gint old_top; + vlen = rspamd_lua_table_size (L, -1); - old_top = lua_gettop (L); - lua_pushstring (L, "callback"); - lua_gettable (L, -2); + for (i = 0; i < vlen; i ++) { + lua_rawgeti (L, -1, i + 1); + tok.begin = lua_tolstring (L, -1, &tok.len); - if (lua_type (L, -1) == LUA_TFUNCTION) { - struct rspamd_task **ptask; + if (tok.begin && tok.len > 0) { + elt.original.begin = + rspamd_mempool_ftokdup (task->task_pool, &tok); + elt.original.len = tok.len; + elt.stemmed.begin = elt.original.begin; + elt.stemmed.len = elt.original.len; + elt.normalized.begin = elt.original.begin; + elt.normalized.len = elt.original.len; - ptask = lua_newuserdata (L, sizeof (*ptask)); - rspamd_lua_setclass (L, "rspamd{task}", -1); - *ptask = task; + g_array_append_val (ar, elt); + } - if (lua_pcall (L, 1, LUA_MULTRET, 0) != 0) { - msg_err_task ("stat_metatokens failed: %s", - lua_tostring (L, -1)); lua_pop (L, 1); - } else { - if (lua_gettop (L) > old_top && - lua_istable (L, old_top + 1)) { - lua_pushvalue (L, old_top + 1); - /* Iterate over table of tables */ - for (lua_pushnil (L); lua_next (L, -2); - lua_pop (L, 1)) { - elt.flags = RSPAMD_STAT_TOKEN_FLAG_META| - RSPAMD_STAT_TOKEN_FLAG_LUA_META; - - if (lua_isnumber (L, -1)) { - gdouble num = lua_tonumber (L, -1); - guint8 *pnum = rspamd_mempool_alloc ( - task->task_pool, - sizeof (num)); - - msg_debug_task ("got metatoken number: %.2f", - num); - memcpy (pnum, &num, sizeof (num)); - elt.begin = (gchar *) pnum; - elt.len = sizeof (num); - g_array_append_val (ar, elt); - } else if (lua_isstring (L, -1)) { - const gchar *str; - gsize tlen; - - str = lua_tolstring (L, -1, &tlen); - guint8 *pstr = rspamd_mempool_alloc ( - task->task_pool, - tlen); - memcpy (pstr, str, tlen); - - msg_debug_task ("got metatoken string: %*s", - (gint) tlen, str); - elt.begin = (gchar *) pstr; - elt.len = tlen; - g_array_append_val (ar, elt); - } - else if (lua_istable (L, -1)) { - /* Treat that as unigramms */ - for (lua_pushnil (L); lua_next (L, -2); - lua_pop (L, 1)) { - if (lua_isstring (L, -1)) { - const gchar *str; - gsize tlen; - - str = lua_tolstring (L, -1, &tlen); - guint8 *pstr = rspamd_mempool_alloc ( - task->task_pool, - tlen); - memcpy (pstr, str, tlen); - - msg_debug_task ("got unigramm " - "metatoken string: %*s", - (gint) tlen, str); - elt.begin = (gchar *) pstr; - elt.len = tlen; - elt.flags |= RSPAMD_STAT_TOKEN_FLAG_UNIGRAM; - g_array_append_val (ar, elt); - } - } - } - } - } } } } + + lua_settop (L, 0); } - lua_settop (L, 0); - st_ctx->tokenizer->tokenize_func (st_ctx, - task->task_pool, - ar, - TRUE, - "META:", - task->tokens); + + if (ar->len > 0) { + st_ctx->tokenizer->tokenize_func (st_ctx, + task, + ar, + TRUE, + "M", + task->tokens); + } rspamd_mempool_add_destructor (task->task_pool, rspamd_array_free_hard, ar); @@ -313,10 +125,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, { struct rspamd_mime_text_part *part; rspamd_cryptobox_hash_state_t hst; - rspamd_stat_token_t *tok; rspamd_token_t *st_tok; - GArray *words; - gchar *sub = NULL; guint i, reserved_len = 0; gdouble *pdiff; guchar hout[rspamd_cryptobox_HASHBYTES]; @@ -347,55 +156,26 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, part = g_ptr_array_index (task->text_parts, i); if (!IS_PART_EMPTY (part) && part->utf_words != NULL) { - st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool, + st_ctx->tokenizer->tokenize_func (st_ctx, task, part->utf_words, IS_PART_UTF (part), NULL, task->tokens); } if (pdiff != NULL && (1.0 - *pdiff) * 100.0 > similarity_treshold) { - msg_debug_task ("message has two common parts (%.2f), so skip the last one", + msg_debug_bayes ("message has two common parts (%.2f), so skip the last one", *pdiff); break; } } - if (task->subject != NULL) { - sub = task->subject; - } - - if (sub != NULL) { - UText utxt = UTEXT_INITIALIZER; - UErrorCode uc_err = U_ZERO_ERROR; - gsize slen = strlen (sub); - - utext_openUTF8 (&utxt, - sub, - slen, - &uc_err); - - words = rspamd_tokenize_text (sub, slen, &utxt, RSPAMD_TOKENIZE_UTF, - NULL, NULL, NULL); - - if (words != NULL) { - - for (i = 0; i < words->len; i ++) { - tok = &g_array_index (words, rspamd_stat_token_t, i); - tok->flags |= RSPAMD_STAT_TOKEN_FLAG_SUBJECT; - } - - st_ctx->tokenizer->tokenize_func (st_ctx, - task->task_pool, - words, - TRUE, - "SUBJECT", - task->tokens); - - rspamd_mempool_add_destructor (task->task_pool, - rspamd_array_free_hard, words); - } - - utext_close (&utxt); + if (task->meta_words != NULL) { + st_ctx->tokenizer->tokenize_func (st_ctx, + task, + task->meta_words, + TRUE, + "SUBJECT", + task->tokens); } rspamd_stat_tokenize_parts_metadata (st_ctx, task); @@ -445,10 +225,10 @@ rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx, continue; } - if (!rspamd_symbols_cache_is_symbol_enabled (task, task->cfg->cache, + if (!rspamd_symcache_is_symbol_enabled (task, task->cfg->cache, st->stcf->symbol)) { g_ptr_array_index (task->stat_runtimes, i) = NULL; - msg_debug_task ("symbol %s is disabled, skip classification", + msg_debug_bayes ("symbol %s is disabled, skip classification", st->stcf->symbol); continue; } @@ -550,6 +330,12 @@ rspamd_stat_classifiers_process (struct rspamd_stat_ctx *st_ctx, return; } + for (i = 0; i < st_ctx->classifiers->len; i++) { + cl = g_ptr_array_index (st_ctx->classifiers, i); + cl->spam_learns = 0; + cl->ham_learns = 0; + } + for (i = 0; i < st_ctx->statfiles->len; i++) { st = g_ptr_array_index (st_ctx->statfiles, i); cl = st->classifier; @@ -591,7 +377,7 @@ rspamd_stat_classifiers_process (struct rspamd_stat_ctx *st_ctx, if (bk_run == NULL) { skip = TRUE; - msg_debug_task ("disable classifier %s as statfile symbol %s is disabled", + msg_debug_bayes ("disable classifier %s as statfile symbol %s is disabled", cl->cfg->name, st->stcf->symbol); break; } @@ -600,7 +386,7 @@ rspamd_stat_classifiers_process (struct rspamd_stat_ctx *st_ctx, if (!skip) { if (cl->cfg->min_tokens > 0 && task->tokens->len < cl->cfg->min_tokens) { - msg_debug_task ( + msg_debug_bayes ( "<%s> contains less tokens than required for %s classifier: " "%ud < %ud", task->message_id, @@ -610,7 +396,7 @@ rspamd_stat_classifiers_process (struct rspamd_stat_ctx *st_ctx, continue; } else if (cl->cfg->max_tokens > 0 && task->tokens->len > cl->cfg->max_tokens) { - msg_debug_task ( + msg_debug_bayes ( "<%s> contains more tokens than allowed for %s classifier: " "%ud > %ud", task->message_id, @@ -740,7 +526,7 @@ rspamd_stat_classifiers_learn (struct rspamd_stat_ctx *st_ctx, if ((task->flags & RSPAMD_TASK_FLAG_ALREADY_LEARNED) && err != NULL && *err == NULL) { /* Do not learn twice */ - g_set_error (err, rspamd_stat_quark (), 404, "<%s> has been already " + g_set_error (err, rspamd_stat_quark (), 208, "<%s> has been already " "learned as %s, ignore it", task->message_id, spam ? "spam" : "ham"); @@ -849,7 +635,7 @@ rspamd_stat_classifiers_learn (struct rspamd_stat_ctx *st_ctx, if (!learned && err && *err == NULL) { if (too_large) { - g_set_error (err, rspamd_stat_quark (), 400, + g_set_error (err, rspamd_stat_quark (), 204, "<%s> contains more tokens than allowed for %s classifier: " "%d > %d", task->message_id, @@ -858,7 +644,7 @@ rspamd_stat_classifiers_learn (struct rspamd_stat_ctx *st_ctx, cl->cfg->max_tokens); } else if (too_small) { - g_set_error (err, rspamd_stat_quark (), 400, + g_set_error (err, rspamd_stat_quark (), 204, "<%s> contains less tokens than required for %s classifier: " "%d < %d", task->message_id, @@ -867,7 +653,7 @@ rspamd_stat_classifiers_learn (struct rspamd_stat_ctx *st_ctx, cl->cfg->min_tokens); } else if (conditionally_skipped) { - g_set_error (err, rspamd_stat_quark (), 410, + g_set_error (err, rspamd_stat_quark (), 204, "<%s> is skipped for %s classifier: " "%s", task->message_id, @@ -1107,7 +893,7 @@ rspamd_stat_has_classifier_symbols (struct rspamd_task *task, if (rspamd_task_find_symbol_result (task, st->stcf->symbol)) { if (is_spam == !!st->stcf->is_spam) { - msg_debug_task ("do not autolearn %s as symbol %s is already " + msg_debug_bayes ("do not autolearn %s as symbol %s is already " "added", is_spam ? "spam" : "ham", st->stcf->symbol); return TRUE; |