[Rework] Rework learn and add classify condition

author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2021-09-01 14:26:32 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2021-09-01 14:26:32 +0100
commit: 718238fd33017f346d1e84fe757481f9f147eb90 (patch)
tree: 34140ab35d6b9709d3c8ff45c8c1a7501ee44dd9 /src/libstat
parent: 6b80e5120a9edeebee4e266fc17c81e2a5ddaf40 (diff)
download: rspamd-718238fd33017f346d1e84fe757481f9f147eb90.tar.gz
rspamd-718238fd33017f346d1e84fe757481f9f147eb90.zip
1 files changed, 100 insertions, 80 deletions
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index 8ac4e499e..4e856b563 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -190,9 +190,75 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
 			b32_hout, g_free);
 }
 
+/*
+ * Check the Lua learn (is_learn) or classify conditions of a classifier.
+ * Conditions run in list order; returns TRUE (skip the classifier) as soon as
+ * one returns boolean false. A failing callback is logged and does not skip.
+ */
+static gboolean
+rspamd_stat_classifier_is_skipped (struct rspamd_task *task,
+		struct rspamd_classifier *cl, gboolean is_learn, gboolean is_spam)
+{
+	GList *cur = is_learn ? cl->cfg->learn_conditions : cl->cfg->classify_conditions;
+	const gchar *cond_type = is_learn ? "learn" : "classify";
+	lua_State *L = task->cfg->lua_state;
+	gboolean ret = FALSE;
+
+	while (cur != NULL && !ret) {
+		gint cb_ref = GPOINTER_TO_INT (cur->data);
+		gint old_top = lua_gettop (L);
+		/* Learn callbacks get (task, is_spam, is_unlearn), classify ones (task) */
+		gint nargs = is_learn ? 3 : 1;
+
+		lua_rawgeti (L, LUA_REGISTRYINDEX, cb_ref);
+		struct rspamd_task **ptask = lua_newuserdata (L, sizeof (*ptask));
+		*ptask = task;
+		rspamd_lua_setclass (L, "rspamd{task}", -1);
+
+		if (is_learn) {
+			lua_pushboolean (L, is_spam);
+			lua_pushboolean (L,
+					task->flags & RSPAMD_TASK_FLAG_UNLEARN ? true : false);
+		}
+
+		if (lua_pcall (L, nargs, LUA_MULTRET, 0) != 0) {
+			msg_err_task ("call to %s failed: %s", "condition callback", lua_tostring (L, -1));
+		}
+		else {
+			/* Results are placed right above the saved top, not at index 1 */
+			gint res_idx = old_top + 1, msg_idx = old_top + 2;
+
+			if (lua_isboolean (L, res_idx) && !lua_toboolean (L, res_idx)) {
+				ret = TRUE;
+			}
+
+			if (lua_isstring (L, msg_idx)) {
+				if (ret) {
+					msg_notice_task ("%s condition for classifier %s returned: %s; skip classifier",
+							cond_type, cl->cfg->name, lua_tostring (L, msg_idx));
+				}
+				else {
+					msg_info_task ("%s condition for classifier %s returned: %s",
+							cond_type, cl->cfg->name, lua_tostring (L, msg_idx));
+				}
+			}
+			else if (ret) {
+				msg_notice_task ("%s condition for classifier %s returned false; skip classifier",
+						cond_type, cl->cfg->name);
+			}
+		}
+
+		/* Drop the callback results (or the error message) before moving on */
+		lua_settop (L, old_top);
+		cur = g_list_next (cur);
+	}
+
+	return ret;
+}
+
 static void
 rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
-		struct rspamd_task *task, gboolean learn)
+		struct rspamd_task *task, gboolean is_learn, gboolean is_spam)
 {
 	guint i;
 	struct rspamd_statfile *st;
@@ -207,12 +273,39 @@ rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
 	rspamd_mempool_add_destructor (task->task_pool,
 			rspamd_ptr_array_free_hard, task->stat_runtimes);
 
+	/* Temporary set all stat_runtimes to some max size to distinguish from NULL */
+	for (i = 0; i < st_ctx->statfiles->len; i ++) {
+		g_ptr_array_index (task->stat_runtimes, i) = GSIZE_TO_POINTER(G_MAXSIZE);
+	}
+
+	for (i = 0; i < st_ctx->classifiers->len; i++) {
+		struct rspamd_classifier *cl = g_ptr_array_index (st_ctx->classifiers, i);
+		gboolean skip_classifier = FALSE;
+
+		if (cl->cfg->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND) {
+			skip_classifier = TRUE;
+		}
+		else {
+			if (rspamd_stat_classifier_is_skipped (task, cl, is_learn , is_spam)) {
+				skip_classifier = TRUE;
+			}
+		}
+
+		if (skip_classifier) {
+			/* Set NULL for all statfiles indexed by id */
+			for (int j = 0; j < cl->statfiles_ids->len; j++) {
+				int id = g_array_index (cl->statfiles_ids, gint, j);
+				g_ptr_array_index (task->stat_runtimes, id) = NULL;
+			}
+		}
+	}
+
 	for (i = 0; i < st_ctx->statfiles->len; i ++) {
 		st = g_ptr_array_index (st_ctx->statfiles, i);
 		g_assert (st != NULL);
 
-		if (st->classifier->cfg->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND) {
-			g_ptr_array_index (task->stat_runtimes, i) = NULL;
+		if (g_ptr_array_index (task->stat_runtimes, i) == NULL) {
+			/* The whole classifier is skipped */
 			continue;
 		}
 
@@ -224,7 +317,7 @@ rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
 			continue;
 		}
 
-		bk_run = st->backend->runtime (task, st->stcf, learn, st->bkcf);
+		bk_run = st->backend->runtime (task, st->stcf, is_learn, st->bkcf);
 
 		if (bk_run == NULL) {
 			msg_err_task ("cannot init backend %s for statfile %s",
@@ -249,11 +342,6 @@ rspamd_stat_backends_process (struct rspamd_stat_ctx *st_ctx,
 	for (i = 0; i < st_ctx->statfiles->len; i++) {
 		st = g_ptr_array_index (st_ctx->statfiles, i);
 		cl = st->classifier;
-
-		if (cl->cfg->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND) {
-			continue;
-		}
-
 		bk_run = g_ptr_array_index (task->stat_runtimes, i);
 
 		if (bk_run != NULL) {
@@ -302,10 +390,6 @@ rspamd_stat_classifiers_process (struct rspamd_stat_ctx *st_ctx,
 		st = g_ptr_array_index (st_ctx->statfiles, i);
 		cl = st->classifier;
 
-		if (cl->cfg->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND) {
-			continue;
-		}
-
 		bk_run = g_ptr_array_index (task->stat_runtimes, i);
 		g_assert (st != NULL);
 
@@ -332,10 +416,6 @@ rspamd_stat_classifiers_process (struct rspamd_stat_ctx *st_ctx,
 
 		/* Do not process classifiers on backend failures */
 		for (j = 0; j < cl->statfiles_ids->len; j++) {
-			if (cl->cfg->flags & RSPAMD_FLAG_CLASSIFIER_NO_BACKEND) {
-				continue;
-			}
-
 			id = g_array_index (cl->statfiles_ids, gint, j);
 			bk_run =  g_ptr_array_index (task->stat_runtimes, id);
 			st = g_ptr_array_index (st_ctx->statfiles, id);
@@ -406,7 +486,7 @@ rspamd_stat_classify (struct rspamd_task *task, lua_State *L, guint stage,
 
 	if (stage == RSPAMD_TASK_STAGE_CLASSIFIERS_PRE) {
 		/* Preprocess tokens */
-		rspamd_stat_preprocess (st_ctx, task, FALSE);
+		rspamd_stat_preprocess (st_ctx, task, FALSE, FALSE);
 	}
 	else if (stage == RSPAMD_TASK_STAGE_CLASSIFIERS) {
 		/* Process backends */
@@ -490,13 +570,7 @@ rspamd_stat_classifiers_learn (struct rspamd_stat_ctx *st_ctx,
 {
 	struct rspamd_classifier *cl, *sel = NULL;
 	guint i;
-	gboolean learned = FALSE, too_small = FALSE, too_large = FALSE,
-			conditionally_skipped = FALSE;
-	lua_State *L;
-	struct rspamd_task **ptask;
-	GList *cur;
-	gint cb_ref;
-	gchar *cond_str = NULL;
+	gboolean learned = FALSE, too_small = FALSE, too_large = FALSE;
 
 	if ((task->flags & RSPAMD_TASK_FLAG_ALREADY_LEARNED) && err != NULL &&
 			*err == NULL) {
@@ -544,52 +618,6 @@ rspamd_stat_classifiers_learn (struct rspamd_stat_ctx *st_ctx,
 			continue;
 		}
 
-		/* Check all conditions for this classifier */
-		cur = cl->cfg->learn_conditions;
-		L = task->cfg->lua_state;
-
-		while (cur) {
-			cb_ref = GPOINTER_TO_INT (cur->data);
-
-			gint old_top = lua_gettop (L);
-			lua_rawgeti (L, LUA_REGISTRYINDEX, cb_ref);
-			/* Push task and two booleans: is_spam and is_unlearn */
-			ptask = lua_newuserdata (L, sizeof (*ptask));
-			*ptask = task;
-			rspamd_lua_setclass (L, "rspamd{task}", -1);
-			lua_pushboolean (L, spam);
-			lua_pushboolean (L,
-					task->flags & RSPAMD_TASK_FLAG_UNLEARN ? true : false);
-
-			if (lua_pcall (L, 3, LUA_MULTRET, 0) != 0) {
-				msg_err_task ("call to %s failed: %s",
-						"condition callback",
-						lua_tostring (L, -1));
-			}
-			else {
-				if (lua_isboolean (L, 1)) {
-					if (!lua_toboolean (L, 1)) {
-						conditionally_skipped = TRUE;
-						/* Also check for error string if needed */
-						if (lua_isstring (L, 2)) {
-							cond_str = rspamd_mempool_strdup (task->task_pool,
-									lua_tostring (L, 2));
-						}
-
-						lua_settop (L, old_top);
-						break;
-					}
-				}
-			}
-
-			lua_settop (L, old_top);
-			cur = g_list_next (cur);
-		}
-
-		if (conditionally_skipped) {
-			break;
-		}
-
 		if (cl->subrs->learn_spam_func (cl, task->tokens, task, spam,
 				task->flags & RSPAMD_TASK_FLAG_UNLEARN, err)) {
 			learned = TRUE;
@@ -627,14 +655,6 @@ rspamd_stat_classifiers_learn (struct rspamd_stat_ctx *st_ctx,
 					task->tokens->len,
 					sel->cfg->min_tokens);
 		}
-		else if (conditionally_skipped) {
-			g_set_error (err, rspamd_stat_quark (), 204,
-					"<%s> is skipped for %s classifier: "
-					"%s",
-					MESSAGE_FIELD (task, message_id),
-					sel->cfg->name,
-					cond_str ? cond_str : "unknown reason");
-		}
 	}
 
 	return learned;
@@ -828,7 +848,7 @@ rspamd_stat_learn (struct rspamd_task *task,
 
 	if (stage == RSPAMD_TASK_STAGE_LEARN_PRE) {
 		/* Process classifiers */
-		rspamd_stat_preprocess (st_ctx, task, TRUE);
+		rspamd_stat_preprocess (st_ctx, task, TRUE, spam);
 
 		if (!rspamd_stat_cache_check (st_ctx, task, classifier, spam, err)) {
 			return RSPAMD_STAT_PROCESS_ERROR;
author	Vsevolod Stakhov <vsevolod@highsecure.ru>	2021-09-01 14:26:32 +0100
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>	2021-09-01 14:26:32 +0100
commit	718238fd33017f346d1e84fe757481f9f147eb90 (patch)
tree	34140ab35d6b9709d3c8ff45c8c1a7501ee44dd9 /src/libstat
parent	6b80e5120a9edeebee4e266fc17c81e2a5ddaf40 (diff)
download	rspamd-718238fd33017f346d1e84fe757481f9f147eb90.tar.gz rspamd-718238fd33017f346d1e84fe757481f9f147eb90.zip