[Fix] Really fix the language detector statistical heuristic

author: Vsevolod Stakhov <vsevolod@rspamd.com> 2024-01-18 14:13:41 +0000
committer: Vsevolod Stakhov <vsevolod@rspamd.com> 2024-01-18 14:13:41 +0000
commit: 2da61df6d8af753b71becdeb6adec122fee5a0ba (patch)
tree: eafa5741ea986664e3604dd96ffca775cdff1fec
parent: 1fee9a39017d946152eab4f29e0d143db13ae951 (diff)
download: rspamd-2da61df6d8af753b71becdeb6adec122fee5a0ba.tar.gz
rspamd-2da61df6d8af753b71becdeb6adec122fee5a0ba.zip
1 files changed, 26 insertions, 12 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index c44aa2b04..383005ad1 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -1335,14 +1335,15 @@ rspamd_language_detector_cmp_heuristic(gconstpointer a, gconstpointer b,
 									   gpointer ud)
 {
 	struct rspamd_frequency_sort_cbdata *cbd = ud;
-	const struct rspamd_lang_detector_res
-		*canda = *(const struct rspamd_lang_detector_res **) a,
-		*candb = *(const struct rspamd_lang_detector_res **) b;
+	struct rspamd_lang_detector_res
+		*canda = *(struct rspamd_lang_detector_res **) a,
+		*candb = *(struct rspamd_lang_detector_res **) b;
 	gdouble adj;
 	gdouble proba_adjusted, probb_adjusted, freqa, freqb;
 
 	if (cbd->d->total_occurrences == 0) {
-		return 0;
+		/* Not enough data, compare directly */
+		return rspamd_language_detector_cmp(a, b);
 	}
 
 	freqa = ((gdouble) canda->elt->occurrences) /
@@ -1387,6 +1388,10 @@ rspamd_language_detector_cmp_heuristic(gconstpointer a, gconstpointer b,
 		probb_adjusted += cbd->std * adj;
 	}
 
+	/* Hack: adjust probability directly */
+	canda->prob = proba_adjusted;
+	candb->prob = probb_adjusted;
+
 	if (proba_adjusted > probb_adjusted) {
 		return -1;
 	}
@@ -1998,7 +2003,7 @@ rspamd_language_detector_detect(struct rspamd_task *task,
 
 			kh_foreach_value(candidates, cand, {
 				if (!isnan(cand->prob)) {
-					msg_debug_lang_det("final probability %s -> %.2f", cand->lang,
+					msg_debug_lang_det("pre-sorting probability %s -> %.2f", cand->lang,
 									   cand->prob);
 					g_ptr_array_add(result, cand);
 				}
@@ -2006,18 +2011,18 @@ rspamd_language_detector_detect(struct rspamd_task *task,
 
 			if (frequency_heuristic_applied) {
 				g_ptr_array_sort_with_data(result,
-										   rspamd_language_detector_cmp_heuristic, (gpointer) &cbd);
+										   rspamd_language_detector_cmp_heuristic,
+										   (gpointer) &cbd);
 			}
 			else {
 				g_ptr_array_sort(result, rspamd_language_detector_cmp);
 			}
 
-			if (result->len > 0 && !frequency_heuristic_applied) {
-				cand = g_ptr_array_index(result, 0);
-				if (cand->elt) {
-					cand->elt->occurrences++;
-				}
-				d->total_occurrences++;
+			int i;
+			PTR_ARRAY_FOREACH(result, i, cand)
+			{
+				msg_debug_lang_det("final probability %s -> %.2f", cand->lang,
+								   cand->prob);
 			}
 
 			if (part->languages != NULL) {
@@ -2035,6 +2040,15 @@ rspamd_language_detector_detect(struct rspamd_task *task,
 		kh_destroy(rspamd_candidates_hash, candidates);
 	}
 
+	/* Update internal stat */
+	if (part->languages != NULL && part->languages->len > 0 && !frequency_heuristic_applied) {
+		cand = g_ptr_array_index(part->languages, 0);
+		if (cand->elt) {
+			cand->elt->occurrences++;
+		}
+		d->total_occurrences++;
+	}
+
 	end_ticks = rspamd_get_ticks(TRUE);
 	msg_debug_lang_det("detected languages in %.0f ticks",
 					   (end_ticks - start_ticks));
author	Vsevolod Stakhov <vsevolod@rspamd.com>	2024-01-18 14:13:41 +0000
committer	Vsevolod Stakhov <vsevolod@rspamd.com>	2024-01-18 14:13:41 +0000
commit	2da61df6d8af753b71becdeb6adec122fee5a0ba (patch)
tree	eafa5741ea986664e3604dd96ffca775cdff1fec
parent	1fee9a39017d946152eab4f29e0d143db13ae951 (diff)
download	rspamd-2da61df6d8af753b71becdeb6adec122fee5a0ba.tar.gz rspamd-2da61df6d8af753b71becdeb6adec122fee5a0ba.zip