From 4e85642cbab29192b48c7f662d2ab2921912d9bc Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov <vsevolod@highsecure.ru>
Date: Sun, 4 Feb 2018 20:34:16 +0000
Subject: [PATCH] [Minor] Various tweaks towards improvements in language
 detection

---
 src/libmime/lang_detection.c | 50 +++++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 4eb5023f2..1c3a5bded 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -87,8 +87,11 @@ enum rspamd_language_elt_flags {
 struct rspamd_language_elt {
 	const gchar *name; /* e.g. "en" or "ru" */
 	enum rspamd_language_elt_flags flags;
-	guint unigramms_total; /* total frequencies for unigramms */
-	guint trigramms_total; /* total frequencies for trigramms */
+	guint ngramms_total;
+	guint unigramms_words;
+	guint trigramms_words;
+	gdouble mean;
+	gdouble std;
 	guint occurencies; /* total number of parts with this language */
 };
 
@@ -358,9 +361,9 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
 		return;
 	}
 	else {
-		nelt->unigramms_total = ucl_object_toint (ucl_array_find_index (n_words,
+		nelt->unigramms_words = ucl_object_toint (ucl_array_find_index (n_words,
 				0));
-		nelt->trigramms_total = ucl_object_toint (ucl_array_find_index (n_words,
+		nelt->trigramms_words = ucl_object_toint (ucl_array_find_index (n_words,
 				2));
 	}
 
@@ -380,10 +383,6 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
 		if (rspamd_language_search_str (nelt->name, unigramms_langs,
 				G_N_ELEMENTS (unigramms_langs))) {
 			nelt->flags |= RS_LANGUAGE_UNIGRAMM;
-			total = nelt->unigramms_total;
-		}
-		else {
-			total = nelt->trigramms_total;
 		}
 
 		if (rspamd_language_search_str (nelt->name, tier1_langs,
@@ -480,11 +479,6 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
 			}
 
 			/* Now, discriminate low frequency ngramms */
-			if (ucs_elt->freq < mean) {
-				ucs_elt->freq = 0;
-				skipped ++;
-				continue;
-			}
 
 			total += ucs_elt->freq;
 			loaded ++;
@@ -494,8 +488,8 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
 
 		PTR_ARRAY_FOREACH (ngramms, i, ucs_elt) {
 			if (ucs_elt->freq > 0) {
-				rspamd_language_detector_init_ngramm (cfg, d, nelt, ucs_elt->s,
-						nsym,
+				rspamd_language_detector_init_ngramm (cfg, d,
+						nelt, ucs_elt->s, nsym,
 						ucs_elt->freq, total);
 			}
 		}
@@ -509,15 +503,18 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
 		}
 
 		g_ptr_array_free (ngramms, TRUE);
+		nelt->mean = mean;
+		nelt->std = std;
+		nelt->ngramms_total = total;
 		msg_info_config ("loaded %s language, %d unigramms, %d trigramms, "
 				"%d ngramms loaded; "
-				"std=%.2f, mean=%.2f, discrimination=%.2f, skipped=%d, loaded=%d; "
+				"std=%.2f, mean=%.2f, skipped=%d, loaded=%d; "
 				"(%s)",
 				nelt->name,
-				(gint)nelt->unigramms_total,
-				(gint)nelt->trigramms_total,
+				(gint)nelt->unigramms_words,
+				(gint)nelt->trigramms_words,
 				total,
-				std, mean, mean + std / 2.0,
+				std, mean,
 				skipped, loaded,
 				rspamd_language_detector_print_flags (nelt));
 	}
@@ -663,7 +660,6 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
 {
 	guint step_len, remainder, i, out_idx;
 	guint64 coin, sel;
-	goffset tmp;
 	rspamd_stat_token_t *tok;
 
 	g_assert (nwords != 0);
@@ -811,7 +807,7 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_task *task,
 	struct rspamd_ngramm_elt *elt;
 	struct rspamd_lang_detector_res *cand;
 	GHashTable *ngramms;
-	gdouble mult = 1.0, prob;
+	gdouble prob;
 
 	switch (type) {
 	case rs_unigramm:
@@ -828,9 +824,15 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_task *task,
 	if (ar) {
 		PTR_ARRAY_FOREACH (ar, i, elt) {
 			cand = g_hash_table_lookup (candidates, elt->elt->name);
-
-			prob = elt->prob * mult;
-
+			prob = elt->prob;
+#ifdef NGRAMMS_DEBUG
+			UConverter *ucnv;
+			UErrorCode uc_err = U_ZERO_ERROR;
+			char buf[1024];
+			ucnv = ucnv_open ("UTF-8", &uc_err);
+			ucnv_fromUChars (ucnv, buf, sizeof (buf), window, 3, &uc_err);
+			msg_err ("gramm: %s, lang: %s, prob: %.3f", buf, elt->elt->name, log2 (elt->prob));
+#endif
 			if (cand == NULL) {
 				cand = g_malloc (sizeof (*cand));
 				cand->elt = elt->elt;
-- 
2.39.5