1 files changed, 76 insertions, 290 deletions
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index ca51d7b02..d097e12e0 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -32,273 +32,85 @@
 static const gdouble similarity_treshold = 80.0;
 
 static void
-rspamd_stat_tokenize_header (struct rspamd_task *task,
-		const gchar *name, const gchar *prefix, GArray *ar)
-{
-	struct rspamd_mime_header *cur;
-	GPtrArray *hdrs;
-	guint i;
-	rspamd_stat_token_t str;
-
-	hdrs = g_hash_table_lookup (task->raw_headers, name);
-	str.flags = RSPAMD_STAT_TOKEN_FLAG_META;
-
-	if (hdrs != NULL) {
-
-		PTR_ARRAY_FOREACH (hdrs, i, cur) {
-			if (cur->name != NULL) {
-				str.begin = cur->name;
-				str.len = strlen (cur->name);
-				g_array_append_val (ar, str);
-			}
-			if (cur->decoded != NULL) {
-				str.begin = cur->decoded;
-				str.len = strlen (cur->decoded);
-				g_array_append_val (ar, str);
-			}
-			else if (cur->value != NULL) {
-				str.begin = cur->value;
-				str.len = strlen (cur->value);
-				g_array_append_val (ar, str);
-			}
-		}
-
-		msg_debug_task ("added stat tokens for header '%s'", name);
-	}
-}
-
-static void
 rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx,
 		struct rspamd_task *task)
 {
-	struct rspamd_image *img;
-	struct rspamd_mime_part *part;
-	struct rspamd_mime_text_part *tp;
-	GList *cur;
 	GArray *ar;
 	rspamd_stat_token_t elt;
 	guint i;
-	gchar tmpbuf[128];
 	lua_State *L = task->cfg->lua_state;
-	const gchar *headers_hash;
-	struct rspamd_mime_header *hdr;
 
 	ar = g_array_sized_new (FALSE, FALSE, sizeof (elt), 16);
+	memset (&elt, 0, sizeof (elt));
 	elt.flags = RSPAMD_STAT_TOKEN_FLAG_META;
 
-	/* Insert images */
-	for (i = 0; i < task->parts->len; i ++) {
-		part = g_ptr_array_index (task->parts, i);
-
-		if ((part->flags & RSPAMD_MIME_PART_IMAGE) && part->specific.img) {
-			img = part->specific.img;
-
-			/* If an image has a linked HTML part, then we push its details to the stat */
-			if (img->html_image) {
-				elt.begin = (gchar *)"image";
-				elt.len = 5;
-				g_array_append_val (ar, elt);
-				elt.begin = (gchar *)&img->html_image->height;
-				elt.len = sizeof (img->html_image->height);
-				g_array_append_val (ar, elt);
-				elt.begin = (gchar *)&img->html_image->width;
-				elt.len = sizeof (img->html_image->width);
-				g_array_append_val (ar, elt);
-				elt.begin = (gchar *)&img->type;
-				elt.len = sizeof (img->type);
-				g_array_append_val (ar, elt);
-
-				if (img->filename) {
-					elt.begin = (gchar *)img->filename;
-					elt.len = strlen (elt.begin);
-					g_array_append_val (ar, elt);
-				}
+	if (st_ctx->lua_stat_tokens_ref != -1) {
+		gint err_idx, ret;
+		GString *tb;
+		struct rspamd_task **ptask;
 
-				msg_debug_task ("added stat tokens for image '%s'", img->html_image->src);
-			}
-		}
-		else if (part->cd && part->cd->filename.len > 0) {
-			elt.begin = (gchar *)part->cd->filename.begin;
-			elt.len = part->cd->filename.len;
-			g_array_append_val (ar, elt);
-		}
-	}
+		lua_pushcfunction (L, &rspamd_lua_traceback);
+		err_idx = lua_gettop (L);
+		lua_rawgeti (L, LUA_REGISTRYINDEX, st_ctx->lua_stat_tokens_ref);
 
-	/* Process mime parts */
-	for (i = 0; i < task->parts->len; i ++) {
-		part = g_ptr_array_index (task->parts, i);
+		ptask = lua_newuserdata (L, sizeof (*ptask));
+		*ptask = task;
+		rspamd_lua_setclass (L, "rspamd{task}", -1);
 
-		if (IS_CT_MULTIPART (part->ct)) {
-			elt.begin = (gchar *)part->ct->boundary.begin;
-			elt.len = part->ct->boundary.len;
+		if ((ret = lua_pcall (L, 1, 1, err_idx)) != 0) {
+			tb = lua_touserdata (L, -1);
+			msg_err_task ("call to stat_tokens lua "
+							"script failed (%d): %v", ret, tb);
 
-			if (elt.len) {
-				msg_debug_task ("added stat tokens for mime boundary '%*s'",
-						(gint)elt.len, elt.begin);
-				g_array_append_val (ar, elt);
+			if (tb) {
+				g_string_free (tb, TRUE);
 			}
-
-			if (part->parsed_data.len > 1) {
-				rspamd_snprintf (tmpbuf, sizeof (tmpbuf), "mime%d:%dlog",
-						i, (gint)log2 (part->parsed_data.len));
-				elt.begin = rspamd_mempool_strdup (task->task_pool, tmpbuf);
-				elt.len = strlen (elt.begin);
-				g_array_append_val (ar, elt);
-			}
-		}
-	}
-
-	/* Process text parts metadata */
-	for (i = 0; i < task->text_parts->len; i ++) {
-		tp = g_ptr_array_index (task->text_parts, i);
-
-		if (tp->language != NULL && tp->language[0] != '\0') {
-			elt.begin = (gchar *)tp->language;
-			elt.len = strlen (elt.begin);
-			msg_debug_task ("added stat tokens for part language '%s'", elt.begin);
-			g_array_append_val (ar, elt);
-		}
-		if (tp->real_charset != NULL) {
-			elt.begin = (gchar *)tp->real_charset;
-			elt.len = strlen (elt.begin);
-			msg_debug_task ("added stat tokens for part charset '%s'", elt.begin);
-			g_array_append_val (ar, elt);
-		}
-	}
-
-	cur = g_list_first (task->cfg->classify_headers);
-
-	while (cur) {
-		rspamd_stat_tokenize_header (task, cur->data, "UA:", ar);
-
-		cur = g_list_next (cur);
-	}
-
-	/* Use headers order */
-	headers_hash = rspamd_mempool_get_variable (task->task_pool,
-			RSPAMD_MEMPOOL_HEADERS_HASH);
-
-	if (headers_hash) {
-		elt.begin = (gchar *)headers_hash;
-		elt.len = 16;
-		g_array_append_val (ar, elt);
-	}
-
-	/* Use more precise headers order */
-	cur = g_list_first (task->headers_order->head);
-	while (cur) {
-		hdr = cur->data;
-
-		if (hdr->name && hdr->type != RSPAMD_HEADER_RECEIVED) {
-			elt.begin = hdr->name;
-			elt.len = strlen (hdr->name);
-			g_array_append_val (ar, elt);
 		}
+		else {
+			if (lua_type (L, -1) != LUA_TTABLE) {
+				msg_err_task ("stat_tokens invocation must return "
+								"table and not %s",
+						lua_typename (L, lua_type (L, -1)));
+			}
+			else {
+				guint vlen;
+				rspamd_ftok_t tok;
 
-		cur = g_list_next (cur);
-	}
-
-	/* Use metatokens plugin from Lua */
-	lua_getglobal (L, "rspamd_plugins");
-
-	if (lua_type (L, -1) == LUA_TTABLE) {
-		lua_pushstring (L, "stat_metatokens");
-		lua_gettable (L, -2);
-
-		if (lua_type (L, -1) == LUA_TTABLE) {
-			gint old_top;
+				vlen = rspamd_lua_table_size (L, -1);
 
-			old_top = lua_gettop (L);
-			lua_pushstring (L, "callback");
-			lua_gettable (L, -2);
+				for (i = 0; i < vlen; i ++) {
+					lua_rawgeti (L, -1, i + 1);
+					tok.begin = lua_tolstring (L, -1, &tok.len);
 
-			if (lua_type (L, -1) == LUA_TFUNCTION) {
-				struct rspamd_task **ptask;
+					if (tok.begin && tok.len > 0) {
+						elt.original.begin =
+								rspamd_mempool_ftokdup (task->task_pool, &tok);
+						elt.original.len = tok.len;
+						elt.stemmed.begin = elt.original.begin;
+						elt.stemmed.len = elt.original.len;
+						elt.normalized.begin = elt.original.begin;
+						elt.normalized.len = elt.original.len;
 
-				ptask = lua_newuserdata (L, sizeof (*ptask));
-				rspamd_lua_setclass (L, "rspamd{task}", -1);
-				*ptask = task;
+						g_array_append_val (ar, elt);
+					}
 
-				if (lua_pcall (L, 1, LUA_MULTRET, 0) != 0) {
-					msg_err_task ("stat_metatokens failed: %s",
-							lua_tostring (L, -1));
 					lua_pop (L, 1);
-				} else {
-					if (lua_gettop (L) > old_top &&
-							lua_istable (L, old_top + 1)) {
-						lua_pushvalue (L, old_top + 1);
-						/* Iterate over table of tables */
-						for (lua_pushnil (L); lua_next (L, -2);
-								lua_pop (L, 1)) {
-							elt.flags = RSPAMD_STAT_TOKEN_FLAG_META|
-									RSPAMD_STAT_TOKEN_FLAG_LUA_META;
-
-							if (lua_isnumber (L, -1)) {
-								gdouble num = lua_tonumber (L, -1);
-								guint8 *pnum = rspamd_mempool_alloc (
-										task->task_pool,
-										sizeof (num));
-
-								msg_debug_task ("got metatoken number: %.2f",
-										num);
-								memcpy (pnum, &num, sizeof (num));
-								elt.begin = (gchar *) pnum;
-								elt.len = sizeof (num);
-								g_array_append_val (ar, elt);
-							} else if (lua_isstring (L, -1)) {
-								const gchar *str;
-								gsize tlen;
-
-								str = lua_tolstring (L, -1, &tlen);
-								guint8 *pstr = rspamd_mempool_alloc (
-										task->task_pool,
-										tlen);
-								memcpy (pstr, str, tlen);
-
-								msg_debug_task ("got metatoken string: %*s",
-										(gint) tlen, str);
-								elt.begin = (gchar *) pstr;
-								elt.len = tlen;
-								g_array_append_val (ar, elt);
-							}
-							else if (lua_istable (L, -1)) {
-								/* Treat that as unigramms */
-								for (lua_pushnil (L); lua_next (L, -2);
-										lua_pop (L, 1)) {
-									if (lua_isstring (L, -1)) {
-										const gchar *str;
-										gsize tlen;
-
-										str = lua_tolstring (L, -1, &tlen);
-										guint8 *pstr = rspamd_mempool_alloc (
-												task->task_pool,
-												tlen);
-										memcpy (pstr, str, tlen);
-
-										msg_debug_task ("got unigramm "
-												"metatoken string: %*s",
-												(gint) tlen, str);
-										elt.begin = (gchar *) pstr;
-										elt.len = tlen;
-										elt.flags |= RSPAMD_STAT_TOKEN_FLAG_UNIGRAM;
-										g_array_append_val (ar, elt);
-									}
-								}
-							}
-						}
-					}
 				}
 			}
 		}
+
+		lua_settop (L, 0);
 	}
 
-	lua_settop (L, 0);
-	st_ctx->tokenizer->tokenize_func (st_ctx,
-			task->task_pool,
-			ar,
-			TRUE,
-			"META:",
-			task->tokens);
+
+	if (ar->len > 0) {
+		st_ctx->tokenizer->tokenize_func (st_ctx,
+				task,
+				ar,
+				TRUE,
+				"M",
+				task->tokens);
+	}
 
 	rspamd_mempool_add_destructor (task->task_pool,
 			rspamd_array_free_hard, ar);
@@ -313,10 +125,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
 {
 	struct rspamd_mime_text_part *part;
 	rspamd_cryptobox_hash_state_t hst;
-	rspamd_stat_token_t *tok;
 	rspamd_token_t *st_tok;
-	GArray *words;
-	gchar *sub = NULL;
 	guint i, reserved_len = 0;
 	gdouble *pdiff;
 	guchar hout[rspamd_cryptobox_HASHBYTES];
@@ -347,55 +156,26 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
 		part = g_ptr_array_index (task->text_parts, i);
 
 		if (!IS_PART_EMPTY (part) && part->utf_words != NULL) {
-			st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool,
+			st_ctx->tokenizer->tokenize_func (st_ctx, task,
 					part->utf_words, IS_PART_UTF (part),
 					NULL, task->tokens);
 		}
 
 
 		if (pdiff != NULL && (1.0 - *pdiff) * 100.0 > similarity_treshold) {
-			msg_debug_task ("message has two common parts (%.2f), so skip the last one",
+			msg_debug_bayes ("message has two common parts (%.2f), so skip the last one",
 					*pdiff);
 			break;
 		}
 	}
 
-	if (task->subject != NULL) {
-		sub = task->subject;
-	}
-
-	if (sub != NULL) {
-		UText utxt = UTEXT_INITIALIZER;
-		UErrorCode uc_err = U_ZERO_ERROR;
-		gsize slen = strlen (sub);
-
-		utext_openUTF8 (&utxt,
-				sub,
-				slen,
-				&uc_err);
-
-		words = rspamd_tokenize_text (sub, slen, &utxt, RSPAMD_TOKENIZE_UTF,
-				NULL, NULL, NULL);
-
-		if (words != NULL) {
-
-			for (i = 0; i < words->len; i ++) {
-				tok = &g_array_index (words, rspamd_stat_token_t, i);
-				tok->flags |= RSPAMD_STAT_TOKEN_FLAG_SUBJECT;
-			}
-
-			st_ctx->tokenizer->tokenize_func (st_ctx,
-					task->task_pool,
-					words,
-					TRUE,
-					"SUBJECT",
-					task->tokens);
-
-			rspamd_mempool_add_destructor (task->task_pool,
-					rspamd_array_free_hard, words);
-		}
-
-		utext_close (&utxt);
+	if (task->meta_words != NULL) {
+		st_ctx->tokenizer->tokenize_func (st_ctx,
+				task,
+				task->meta_words,
+				TRUE,
+				"SUBJECT",
+				task->tokens);
 	}
 
 	rspamd_stat_tokenize_parts_metadata (st_ctx, task);
@@ -445,10 +225,10 @@ rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
 			continue;
 		}
 
-		if (!rspamd_symbols_cache_is_symbol_enabled (task, task->cfg->cache,
+		if (!rspamd_symcache_is_symbol_enabled (task, task->cfg->cache,
 				st->stcf->symbol)) {
 			g_ptr_array_index (task->stat_runtimes, i) = NULL;
-			msg_debug_task ("symbol %s is disabled, skip classification",
+			msg_debug_bayes ("symbol %s is disabled, skip classification",
 					st->stcf->symbol);
 			continue;
 		}
@@ -550,6 +330,12 @@ rspamd_stat_classifiers_process (struct rspamd_stat_ctx *st_ctx,
 		return;
 	}
 
+	for (i = 0; i < st_ctx->classifiers->len; i++) {
+		cl = g_ptr_array_index (st_ctx->classifiers, i);
+		cl->spam_learns = 0;
+		cl->ham_learns = 0;
+	}
+
 	for (i = 0; i < st_ctx->statfiles->len; i++) {
 		st = g_ptr_array_index (st_ctx->statfiles, i);
 		cl = st->classifier;
@@ -591,7 +377,7 @@ rspamd_stat_classifiers_process (struct rspamd_stat_ctx *st_ctx,
 
 				if (bk_run == NULL) {
 					skip = TRUE;
-					msg_debug_task ("disable classifier %s as statfile symbol %s is disabled",
+					msg_debug_bayes ("disable classifier %s as statfile symbol %s is disabled",
 							cl->cfg->name, st->stcf->symbol);
 					break;
 				}
@@ -600,7 +386,7 @@ rspamd_stat_classifiers_process (struct rspamd_stat_ctx *st_ctx,
 
 		if (!skip) {
 			if (cl->cfg->min_tokens > 0 && task->tokens->len < cl->cfg->min_tokens) {
-				msg_debug_task (
+				msg_debug_bayes (
 						"<%s> contains less tokens than required for %s classifier: "
 						"%ud < %ud",
 						task->message_id,
@@ -610,7 +396,7 @@ rspamd_stat_classifiers_process (struct rspamd_stat_ctx *st_ctx,
 				continue;
 			}
 			else if (cl->cfg->max_tokens > 0 && task->tokens->len > cl->cfg->max_tokens) {
-				msg_debug_task (
+				msg_debug_bayes (
 						"<%s> contains more tokens than allowed for %s classifier: "
 						"%ud > %ud",
 						task->message_id,
@@ -740,7 +526,7 @@ rspamd_stat_classifiers_learn (struct rspamd_stat_ctx *st_ctx,
 	if ((task->flags & RSPAMD_TASK_FLAG_ALREADY_LEARNED) && err != NULL &&
 			*err == NULL) {
 		/* Do not learn twice */
-		g_set_error (err, rspamd_stat_quark (), 404, "<%s> has been already "
+		g_set_error (err, rspamd_stat_quark (), 208, "<%s> has been already "
 				"learned as %s, ignore it", task->message_id,
 				spam ? "spam" : "ham");
 
@@ -849,7 +635,7 @@ rspamd_stat_classifiers_learn (struct rspamd_stat_ctx *st_ctx,
 
 	if (!learned && err && *err == NULL) {
 		if (too_large) {
-			g_set_error (err, rspamd_stat_quark (), 400,
+			g_set_error (err, rspamd_stat_quark (), 204,
 					"<%s> contains more tokens than allowed for %s classifier: "
 					"%d > %d",
 					task->message_id,
@@ -858,7 +644,7 @@ rspamd_stat_classifiers_learn (struct rspamd_stat_ctx *st_ctx,
 					cl->cfg->max_tokens);
 		}
 		else if (too_small) {
-			g_set_error (err, rspamd_stat_quark (), 400,
+			g_set_error (err, rspamd_stat_quark (), 204,
 					"<%s> contains less tokens than required for %s classifier: "
 					"%d < %d",
 					task->message_id,
@@ -867,7 +653,7 @@ rspamd_stat_classifiers_learn (struct rspamd_stat_ctx *st_ctx,
 					cl->cfg->min_tokens);
 		}
 		else if (conditionally_skipped) {
-			g_set_error (err, rspamd_stat_quark (), 410,
+			g_set_error (err, rspamd_stat_quark (), 204,
 					"<%s> is skipped for %s classifier: "
 					"%s",
 					task->message_id,
@@ -1107,7 +893,7 @@ rspamd_stat_has_classifier_symbols (struct rspamd_task *task,
 
 		if (rspamd_task_find_symbol_result (task, st->stcf->symbol)) {
 			if (is_spam == !!st->stcf->is_spam) {
-				msg_debug_task ("do not autolearn %s as symbol %s is already "
+				msg_debug_bayes ("do not autolearn %s as symbol %s is already "
 						"added", is_spam ? "spam" : "ham", st->stcf->symbol);
 
 				return TRUE;