about summary refs log tree commit diff stats
path: root/src/libstat/stat_process.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/libstat/stat_process.c')
-rw-r--r-- src/libstat/stat_process.c 366
1 files changed, 76 insertions, 290 deletions
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index ca51d7b02..d097e12e0 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -32,273 +32,85 @@
static const gdouble similarity_treshold = 80.0;
static void
-rspamd_stat_tokenize_header (struct rspamd_task *task,
- const gchar *name, const gchar *prefix, GArray *ar)
-{
- struct rspamd_mime_header *cur;
- GPtrArray *hdrs;
- guint i;
- rspamd_stat_token_t str;
-
- hdrs = g_hash_table_lookup (task->raw_headers, name);
- str.flags = RSPAMD_STAT_TOKEN_FLAG_META;
-
- if (hdrs != NULL) {
-
- PTR_ARRAY_FOREACH (hdrs, i, cur) {
- if (cur->name != NULL) {
- str.begin = cur->name;
- str.len = strlen (cur->name);
- g_array_append_val (ar, str);
- }
- if (cur->decoded != NULL) {
- str.begin = cur->decoded;
- str.len = strlen (cur->decoded);
- g_array_append_val (ar, str);
- }
- else if (cur->value != NULL) {
- str.begin = cur->value;
- str.len = strlen (cur->value);
- g_array_append_val (ar, str);
- }
- }
-
- msg_debug_task ("added stat tokens for header '%s'", name);
- }
-}
-
-static void
rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx,
struct rspamd_task *task)
{
- struct rspamd_image *img;
- struct rspamd_mime_part *part;
- struct rspamd_mime_text_part *tp;
- GList *cur;
GArray *ar;
rspamd_stat_token_t elt;
guint i;
- gchar tmpbuf[128];
lua_State *L = task->cfg->lua_state;
- const gchar *headers_hash;
- struct rspamd_mime_header *hdr;
ar = g_array_sized_new (FALSE, FALSE, sizeof (elt), 16);
+ memset (&elt, 0, sizeof (elt));
elt.flags = RSPAMD_STAT_TOKEN_FLAG_META;
- /* Insert images */
- for (i = 0; i < task->parts->len; i ++) {
- part = g_ptr_array_index (task->parts, i);
-
- if ((part->flags & RSPAMD_MIME_PART_IMAGE) && part->specific.img) {
- img = part->specific.img;
-
- /* If an image has a linked HTML part, then we push its details to the stat */
- if (img->html_image) {
- elt.begin = (gchar *)"image";
- elt.len = 5;
- g_array_append_val (ar, elt);
- elt.begin = (gchar *)&img->html_image->height;
- elt.len = sizeof (img->html_image->height);
- g_array_append_val (ar, elt);
- elt.begin = (gchar *)&img->html_image->width;
- elt.len = sizeof (img->html_image->width);
- g_array_append_val (ar, elt);
- elt.begin = (gchar *)&img->type;
- elt.len = sizeof (img->type);
- g_array_append_val (ar, elt);
-
- if (img->filename) {
- elt.begin = (gchar *)img->filename;
- elt.len = strlen (elt.begin);
- g_array_append_val (ar, elt);
- }
+ if (st_ctx->lua_stat_tokens_ref != -1) {
+ gint err_idx, ret;
+ GString *tb;
+ struct rspamd_task **ptask;
- msg_debug_task ("added stat tokens for image '%s'", img->html_image->src);
- }
- }
- else if (part->cd && part->cd->filename.len > 0) {
- elt.begin = (gchar *)part->cd->filename.begin;
- elt.len = part->cd->filename.len;
- g_array_append_val (ar, elt);
- }
- }
+ lua_pushcfunction (L, &rspamd_lua_traceback);
+ err_idx = lua_gettop (L);
+ lua_rawgeti (L, LUA_REGISTRYINDEX, st_ctx->lua_stat_tokens_ref);
- /* Process mime parts */
- for (i = 0; i < task->parts->len; i ++) {
- part = g_ptr_array_index (task->parts, i);
+ ptask = lua_newuserdata (L, sizeof (*ptask));
+ *ptask = task;
+ rspamd_lua_setclass (L, "rspamd{task}", -1);
- if (IS_CT_MULTIPART (part->ct)) {
- elt.begin = (gchar *)part->ct->boundary.begin;
- elt.len = part->ct->boundary.len;
+ if ((ret = lua_pcall (L, 1, 1, err_idx)) != 0) {
+ tb = lua_touserdata (L, -1);
+ msg_err_task ("call to stat_tokens lua "
+ "script failed (%d): %v", ret, tb);
- if (elt.len) {
- msg_debug_task ("added stat tokens for mime boundary '%*s'",
- (gint)elt.len, elt.begin);
- g_array_append_val (ar, elt);
+ if (tb) {
+ g_string_free (tb, TRUE);
}
-
- if (part->parsed_data.len > 1) {
- rspamd_snprintf (tmpbuf, sizeof (tmpbuf), "mime%d:%dlog",
- i, (gint)log2 (part->parsed_data.len));
- elt.begin = rspamd_mempool_strdup (task->task_pool, tmpbuf);
- elt.len = strlen (elt.begin);
- g_array_append_val (ar, elt);
- }
- }
- }
-
- /* Process text parts metadata */
- for (i = 0; i < task->text_parts->len; i ++) {
- tp = g_ptr_array_index (task->text_parts, i);
-
- if (tp->language != NULL && tp->language[0] != '\0') {
- elt.begin = (gchar *)tp->language;
- elt.len = strlen (elt.begin);
- msg_debug_task ("added stat tokens for part language '%s'", elt.begin);
- g_array_append_val (ar, elt);
- }
- if (tp->real_charset != NULL) {
- elt.begin = (gchar *)tp->real_charset;
- elt.len = strlen (elt.begin);
- msg_debug_task ("added stat tokens for part charset '%s'", elt.begin);
- g_array_append_val (ar, elt);
- }
- }
-
- cur = g_list_first (task->cfg->classify_headers);
-
- while (cur) {
- rspamd_stat_tokenize_header (task, cur->data, "UA:", ar);
-
- cur = g_list_next (cur);
- }
-
- /* Use headers order */
- headers_hash = rspamd_mempool_get_variable (task->task_pool,
- RSPAMD_MEMPOOL_HEADERS_HASH);
-
- if (headers_hash) {
- elt.begin = (gchar *)headers_hash;
- elt.len = 16;
- g_array_append_val (ar, elt);
- }
-
- /* Use more precise headers order */
- cur = g_list_first (task->headers_order->head);
- while (cur) {
- hdr = cur->data;
-
- if (hdr->name && hdr->type != RSPAMD_HEADER_RECEIVED) {
- elt.begin = hdr->name;
- elt.len = strlen (hdr->name);
- g_array_append_val (ar, elt);
}
+ else {
+ if (lua_type (L, -1) != LUA_TTABLE) {
+ msg_err_task ("stat_tokens invocation must return "
+ "table and not %s",
+ lua_typename (L, lua_type (L, -1)));
+ }
+ else {
+ guint vlen;
+ rspamd_ftok_t tok;
- cur = g_list_next (cur);
- }
-
- /* Use metatokens plugin from Lua */
- lua_getglobal (L, "rspamd_plugins");
-
- if (lua_type (L, -1) == LUA_TTABLE) {
- lua_pushstring (L, "stat_metatokens");
- lua_gettable (L, -2);
-
- if (lua_type (L, -1) == LUA_TTABLE) {
- gint old_top;
+ vlen = rspamd_lua_table_size (L, -1);
- old_top = lua_gettop (L);
- lua_pushstring (L, "callback");
- lua_gettable (L, -2);
+ for (i = 0; i < vlen; i ++) {
+ lua_rawgeti (L, -1, i + 1);
+ tok.begin = lua_tolstring (L, -1, &tok.len);
- if (lua_type (L, -1) == LUA_TFUNCTION) {
- struct rspamd_task **ptask;
+ if (tok.begin && tok.len > 0) {
+ elt.original.begin =
+ rspamd_mempool_ftokdup (task->task_pool, &tok);
+ elt.original.len = tok.len;
+ elt.stemmed.begin = elt.original.begin;
+ elt.stemmed.len = elt.original.len;
+ elt.normalized.begin = elt.original.begin;
+ elt.normalized.len = elt.original.len;
- ptask = lua_newuserdata (L, sizeof (*ptask));
- rspamd_lua_setclass (L, "rspamd{task}", -1);
- *ptask = task;
+ g_array_append_val (ar, elt);
+ }
- if (lua_pcall (L, 1, LUA_MULTRET, 0) != 0) {
- msg_err_task ("stat_metatokens failed: %s",
- lua_tostring (L, -1));
lua_pop (L, 1);
- } else {
- if (lua_gettop (L) > old_top &&
- lua_istable (L, old_top + 1)) {
- lua_pushvalue (L, old_top + 1);
- /* Iterate over table of tables */
- for (lua_pushnil (L); lua_next (L, -2);
- lua_pop (L, 1)) {
- elt.flags = RSPAMD_STAT_TOKEN_FLAG_META|
- RSPAMD_STAT_TOKEN_FLAG_LUA_META;
-
- if (lua_isnumber (L, -1)) {
- gdouble num = lua_tonumber (L, -1);
- guint8 *pnum = rspamd_mempool_alloc (
- task->task_pool,
- sizeof (num));
-
- msg_debug_task ("got metatoken number: %.2f",
- num);
- memcpy (pnum, &num, sizeof (num));
- elt.begin = (gchar *) pnum;
- elt.len = sizeof (num);
- g_array_append_val (ar, elt);
- } else if (lua_isstring (L, -1)) {
- const gchar *str;
- gsize tlen;
-
- str = lua_tolstring (L, -1, &tlen);
- guint8 *pstr = rspamd_mempool_alloc (
- task->task_pool,
- tlen);
- memcpy (pstr, str, tlen);
-
- msg_debug_task ("got metatoken string: %*s",
- (gint) tlen, str);
- elt.begin = (gchar *) pstr;
- elt.len = tlen;
- g_array_append_val (ar, elt);
- }
- else if (lua_istable (L, -1)) {
- /* Treat that as unigramms */
- for (lua_pushnil (L); lua_next (L, -2);
- lua_pop (L, 1)) {
- if (lua_isstring (L, -1)) {
- const gchar *str;
- gsize tlen;
-
- str = lua_tolstring (L, -1, &tlen);
- guint8 *pstr = rspamd_mempool_alloc (
- task->task_pool,
- tlen);
- memcpy (pstr, str, tlen);
-
- msg_debug_task ("got unigramm "
- "metatoken string: %*s",
- (gint) tlen, str);
- elt.begin = (gchar *) pstr;
- elt.len = tlen;
- elt.flags |= RSPAMD_STAT_TOKEN_FLAG_UNIGRAM;
- g_array_append_val (ar, elt);
- }
- }
- }
- }
- }
}
}
}
+
+ lua_settop (L, 0);
}
- lua_settop (L, 0);
- st_ctx->tokenizer->tokenize_func (st_ctx,
- task->task_pool,
- ar,
- TRUE,
- "META:",
- task->tokens);
+
+ if (ar->len > 0) {
+ st_ctx->tokenizer->tokenize_func (st_ctx,
+ task,
+ ar,
+ TRUE,
+ "M",
+ task->tokens);
+ }
rspamd_mempool_add_destructor (task->task_pool,
rspamd_array_free_hard, ar);
@@ -313,10 +125,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
{
struct rspamd_mime_text_part *part;
rspamd_cryptobox_hash_state_t hst;
- rspamd_stat_token_t *tok;
rspamd_token_t *st_tok;
- GArray *words;
- gchar *sub = NULL;
guint i, reserved_len = 0;
gdouble *pdiff;
guchar hout[rspamd_cryptobox_HASHBYTES];
@@ -347,55 +156,26 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
part = g_ptr_array_index (task->text_parts, i);
if (!IS_PART_EMPTY (part) && part->utf_words != NULL) {
- st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool,
+ st_ctx->tokenizer->tokenize_func (st_ctx, task,
part->utf_words, IS_PART_UTF (part),
NULL, task->tokens);
}
if (pdiff != NULL && (1.0 - *pdiff) * 100.0 > similarity_treshold) {
- msg_debug_task ("message has two common parts (%.2f), so skip the last one",
+ msg_debug_bayes ("message has two common parts (%.2f), so skip the last one",
*pdiff);
break;
}
}
- if (task->subject != NULL) {
- sub = task->subject;
- }
-
- if (sub != NULL) {
- UText utxt = UTEXT_INITIALIZER;
- UErrorCode uc_err = U_ZERO_ERROR;
- gsize slen = strlen (sub);
-
- utext_openUTF8 (&utxt,
- sub,
- slen,
- &uc_err);
-
- words = rspamd_tokenize_text (sub, slen, &utxt, RSPAMD_TOKENIZE_UTF,
- NULL, NULL, NULL);
-
- if (words != NULL) {
-
- for (i = 0; i < words->len; i ++) {
- tok = &g_array_index (words, rspamd_stat_token_t, i);
- tok->flags |= RSPAMD_STAT_TOKEN_FLAG_SUBJECT;
- }
-
- st_ctx->tokenizer->tokenize_func (st_ctx,
- task->task_pool,
- words,
- TRUE,
- "SUBJECT",
- task->tokens);
-
- rspamd_mempool_add_destructor (task->task_pool,
- rspamd_array_free_hard, words);
- }
-
- utext_close (&utxt);
+ if (task->meta_words != NULL) {
+ st_ctx->tokenizer->tokenize_func (st_ctx,
+ task,
+ task->meta_words,
+ TRUE,
+ "SUBJECT",
+ task->tokens);
}
rspamd_stat_tokenize_parts_metadata (st_ctx, task);
@@ -445,10 +225,10 @@ rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
continue;
}
- if (!rspamd_symbols_cache_is_symbol_enabled (task, task->cfg->cache,
+ if (!rspamd_symcache_is_symbol_enabled (task, task->cfg->cache,
st->stcf->symbol)) {
g_ptr_array_index (task->stat_runtimes, i) = NULL;
- msg_debug_task ("symbol %s is disabled, skip classification",
+ msg_debug_bayes ("symbol %s is disabled, skip classification",
st->stcf->symbol);
continue;
}
@@ -550,6 +330,12 @@ rspamd_stat_classifiers_process (struct rspamd_stat_ctx *st_ctx,
return;
}
+ for (i = 0; i < st_ctx->classifiers->len; i++) {
+ cl = g_ptr_array_index (st_ctx->classifiers, i);
+ cl->spam_learns = 0;
+ cl->ham_learns = 0;
+ }
+
for (i = 0; i < st_ctx->statfiles->len; i++) {
st = g_ptr_array_index (st_ctx->statfiles, i);
cl = st->classifier;
@@ -591,7 +377,7 @@ rspamd_stat_classifiers_process (struct rspamd_stat_ctx *st_ctx,
if (bk_run == NULL) {
skip = TRUE;
- msg_debug_task ("disable classifier %s as statfile symbol %s is disabled",
+ msg_debug_bayes ("disable classifier %s as statfile symbol %s is disabled",
cl->cfg->name, st->stcf->symbol);
break;
}
@@ -600,7 +386,7 @@ rspamd_stat_classifiers_process (struct rspamd_stat_ctx *st_ctx,
if (!skip) {
if (cl->cfg->min_tokens > 0 && task->tokens->len < cl->cfg->min_tokens) {
- msg_debug_task (
+ msg_debug_bayes (
"<%s> contains less tokens than required for %s classifier: "
"%ud < %ud",
task->message_id,
@@ -610,7 +396,7 @@ rspamd_stat_classifiers_process (struct rspamd_stat_ctx *st_ctx,
continue;
}
else if (cl->cfg->max_tokens > 0 && task->tokens->len > cl->cfg->max_tokens) {
- msg_debug_task (
+ msg_debug_bayes (
"<%s> contains more tokens than allowed for %s classifier: "
"%ud > %ud",
task->message_id,
@@ -740,7 +526,7 @@ rspamd_stat_classifiers_learn (struct rspamd_stat_ctx *st_ctx,
if ((task->flags & RSPAMD_TASK_FLAG_ALREADY_LEARNED) && err != NULL &&
*err == NULL) {
/* Do not learn twice */
- g_set_error (err, rspamd_stat_quark (), 404, "<%s> has been already "
+ g_set_error (err, rspamd_stat_quark (), 208, "<%s> has been already "
"learned as %s, ignore it", task->message_id,
spam ? "spam" : "ham");
@@ -849,7 +635,7 @@ rspamd_stat_classifiers_learn (struct rspamd_stat_ctx *st_ctx,
if (!learned && err && *err == NULL) {
if (too_large) {
- g_set_error (err, rspamd_stat_quark (), 400,
+ g_set_error (err, rspamd_stat_quark (), 204,
"<%s> contains more tokens than allowed for %s classifier: "
"%d > %d",
task->message_id,
@@ -858,7 +644,7 @@ rspamd_stat_classifiers_learn (struct rspamd_stat_ctx *st_ctx,
cl->cfg->max_tokens);
}
else if (too_small) {
- g_set_error (err, rspamd_stat_quark (), 400,
+ g_set_error (err, rspamd_stat_quark (), 204,
"<%s> contains less tokens than required for %s classifier: "
"%d < %d",
task->message_id,
@@ -867,7 +653,7 @@ rspamd_stat_classifiers_learn (struct rspamd_stat_ctx *st_ctx,
cl->cfg->min_tokens);
}
else if (conditionally_skipped) {
- g_set_error (err, rspamd_stat_quark (), 410,
+ g_set_error (err, rspamd_stat_quark (), 204,
"<%s> is skipped for %s classifier: "
"%s",
task->message_id,
@@ -1107,7 +893,7 @@ rspamd_stat_has_classifier_symbols (struct rspamd_task *task,
if (rspamd_task_find_symbol_result (task, st->stcf->symbol)) {
if (is_spam == !!st->stcf->is_spam) {
- msg_debug_task ("do not autolearn %s as symbol %s is already "
+ msg_debug_bayes ("do not autolearn %s as symbol %s is already "
"added", is_spam ? "spam" : "ham", st->stcf->symbol);
return TRUE;