From bb70d081392c8bd57dd456f57b8c1aa0f58f0f71 Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov <vsevolod@highsecure.ru>
Date: Sat, 27 Jan 2018 13:17:50 +0000
Subject: [PATCH] [Project] Add more flags to languages

---
 src/libmime/lang_detection.c | 189 ++++++++++++++++++++++++++---------
 1 file changed, 141 insertions(+), 48 deletions(-)

diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 75be74f25..64e820b31 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -29,12 +29,6 @@ static const gsize default_words = 30;
 static const gdouble update_prob = 0.6;
 static const gchar *default_languages_path = RSPAMD_PLUGINSDIR "/languages";
 
-enum rspamd_language_elt_flags {
-	RS_LANGUAGE_DEFAULT = 0,
-	RS_LANGUAGE_LATIN = (1 << 0),
-	RS_LANGUAGE_TIER1 = (1 << 1),
-	RS_LANGUAGE_TIER2 = (1 << 2),
-};
 
 struct rspamd_language_unicode_match {
 	const gchar *lang;
@@ -69,6 +63,22 @@ static const gchar *unigramms_langs[] = {
 		"zh-TW"
 };
 
+/*
+ * Top languages
+ */
+static const gchar *tier1_langs[] = {
+		"en", "fr", "it", "de", "es", "nl", "zh-CN", "zh-TW", "ja",
+		"ko", "pt", "ru", "pl", "tk", "th", "ar"
+};
+
+enum rspamd_language_elt_flags {
+	RS_LANGUAGE_DEFAULT = 0,
+	RS_LANGUAGE_LATIN = (1 << 0),
+	RS_LANGUAGE_UNISCRIPT = (1 << 1),
+	RS_LANGUAGE_UNIGRAMM = (1 << 2),
+	RS_LANGUAGE_TIER1 = (1 << 3),
+	RS_LANGUAGE_TIER2 = (1 << 4),
+};
 
 struct rspamd_language_elt {
 	const gchar *name; /* e.g. "en" or "ru" */
@@ -100,6 +110,34 @@ struct rspamd_lang_detector {
 
 INIT_LOG_MODULE(langdet)
 
+static const struct rspamd_language_unicode_match *
+rspamd_language_search_unicode_match (const gchar *key,
+		const struct rspamd_language_unicode_match *elts, size_t nelts)
+{
+	size_t i;
+
+	for (i = 0; i < nelts; i++) {
+		if (strcmp (elts[i].lang, key) == 0) {
+			return &elts[i];
+		}
+	}
+
+	return NULL;
+}
+
+static gboolean
+rspamd_language_search_str (const gchar *key, const gchar *elts[], size_t nelts)
+{
+	size_t i;
+
+	for (i = 0; i < nelts; i++) {
+		if (strcmp (elts[i], key) == 0) {
+			return TRUE;
+		}
+	}
+	return FALSE;
+}
+
 static guint
 rspamd_unigram_hash (gconstpointer key)
 {
@@ -210,6 +248,11 @@ rspamd_language_detector_init_ngramm (struct rspamd_config *cfg,
 	}
 }
 
+struct rspamd_language_ucs_elt {
+	guint freq;
+	UChar s[0];
+};
+
 static void
 rspamd_language_detector_read_file (struct rspamd_config *cfg,
 		struct rspamd_lang_detector *d,
@@ -221,8 +264,10 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
 	ucl_object_iter_t it = NULL;
 	UErrorCode uc_err = U_ZERO_ERROR;
 	struct rspamd_language_elt *nelt;
+	const struct rspamd_language_unicode_match *uc_match;
+	struct rspamd_language_ucs_elt *ucs_elt;
 	gchar *pos;
-	guint total = 0, total_latin = 0, total_ngramms = 0;
+	guint total = 0, total_latin = 0, total_ngramms = 0, i;
 
 	parser = ucl_parser_new (UCL_PARSER_NO_FILEVARS);
 	if (!ucl_parser_add_file (parser, path)) {
@@ -270,58 +315,105 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
 				2));
 	}
 
-	while ((cur = ucl_object_iterate (freqs, &it, true)) != NULL) {
-		const gchar *key;
-		gsize keylen;
-		guint freq, nsym;
-		UChar *ucs_key;
+	if ((uc_match = rspamd_language_search_unicode_match (nelt->name, unicode_langs,
+			G_N_ELEMENTS (unicode_langs))) != NULL) {
+		g_hash_table_insert (d->unicode_scripts, (gpointer)&uc_match->unicode_code,
+				nelt);
+		nelt->flags |= RS_LANGUAGE_UNISCRIPT;
+	}
+	else {
+		GPtrArray *ngramms;
+		guint nsym;
 
-		key = ucl_object_keyl (cur, &keylen);
-		freq = ucl_object_toint (cur);
+		if (rspamd_language_search_str (nelt->name, unigramms_langs,
+				G_N_ELEMENTS (unigramms_langs))) {
+			nelt->flags |= RS_LANGUAGE_UNIGRAMM;
+			total = nelt->unigramms_total;
+		}
+		else {
+			total = nelt->trigramms_total;
+		}
 
-		if (key != NULL) {
-			ucs_key = rspamd_mempool_alloc (cfg->cfg_pool,
-					(keylen + 1) * sizeof (UChar));
+		it = NULL;
+		ngramms = g_ptr_array_sized_new (freqs->len);
+
+		while ((cur = ucl_object_iterate (freqs, &it, true)) != NULL) {
+			const gchar *key;
+			gsize keylen;
+			guint freq;
+
+			key = ucl_object_keyl (cur, &keylen);
+			freq = ucl_object_toint (cur);
+
+			if (key != NULL) {
+				ucs_elt = rspamd_mempool_alloc (cfg->cfg_pool,
+						sizeof (*ucs_elt) + (keylen + 1) * sizeof (UChar));
+
+				nsym = ucnv_toUChars (d->uchar_converter,
+						ucs_elt->s, keylen + 1,
+						key,
+						keylen, &uc_err);
+
+				if (uc_err != U_ZERO_ERROR) {
+					msg_warn_config ("cannot convert key to unicode: %s",
+							u_errorName (uc_err));
+
+					continue;
+				}
+
+				rspamd_language_detector_ucs_lowercase (ucs_elt->s, nsym);
+
+				if (nsym == 2) {
+					/* We have a digraph */
+					continue;
+				}
+				else if (nsym == 3 && !(nelt->flags & RS_LANGUAGE_UNIGRAMM)) {
+					g_ptr_array_add (ngramms, ucs_elt);
+				}
+				else if (nsym == 1 && nelt->flags & RS_LANGUAGE_UNIGRAMM) {
+					g_ptr_array_add (ngramms, ucs_elt);
+				}
+				else if (nsym > 3) {
+					msg_warn_config ("have more than 3 characters in key: %d",
+							nsym);
+					continue;
+				}
+
+				if (rspamd_language_detector_ucs_is_latin (ucs_elt->s, nsym)) {
+					total_latin++;
+				}
+
+				ucs_elt->freq = freq;
+
+				total_ngramms++;
+			}
+		}
 
-			nsym = ucnv_toUChars (d->uchar_converter, ucs_key, keylen + 1, key,
-					keylen, &uc_err);
+		if (total_latin >= total_ngramms * 2 / 3) {
+			nelt->flags |= RS_LANGUAGE_LATIN;
+		}
 
-			if (uc_err != U_ZERO_ERROR) {
-				msg_warn_config ("cannot convert key to unicode: %s",
-						u_errorName (uc_err));
+		if (nelt->flags & RS_LANGUAGE_UNIGRAMM) {
+			nsym = 1;
+		}
+		else {
+			nsym = 3;
+		}
 
-				continue;
-			}
+		PTR_ARRAY_FOREACH (ngramms, i, ucs_elt) {
 
-			rspamd_language_detector_ucs_lowercase (ucs_key, nsym);
-			if (nsym == 2) {
-				/* We have a digraph */
-				continue;
-			}
-			else if (nsym == 3) {
-				total = nelt->trigramms_total;
-			}
-			else if (nsym == 1) {
-				total = nelt->unigramms_total;
-			}
-			else if (nsym > 3) {
-				msg_warn_config ("have more than 3 characters in key: %d", nsym);
+			if (!(nelt->flags & RS_LANGUAGE_LATIN) &&
+					rspamd_language_detector_ucs_is_latin (ucs_elt->s, nsym)) {
+				/* Skip latin ngramm for non-latin language to avoid garbage */
 				continue;
 			}
 
-			rspamd_language_detector_init_ngramm (cfg, d, nelt, ucs_key, nsym,
-					freq, total);
-
-			if (rspamd_language_detector_ucs_is_latin (ucs_key, nsym)) {
-				total_latin ++;
-			}
-
-			total_ngramms ++;
+			rspamd_language_detector_init_ngramm (cfg, d, nelt, ucs_elt->s,
+					nsym,
+					ucs_elt->freq, total);
 		}
-	}
 
-	if (total_latin >= total_ngramms * 2 / 3) {
-		nelt->flags |= RS_LANGUAGE_LATIN;
+		g_ptr_array_free (ngramms, TRUE);
 	}
 
 	msg_info_config ("loaded %s language, %d unigramms, %d trigramms",
@@ -403,6 +495,7 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
 			rspamd_unigram_equal, NULL, rspamd_ptr_array_free_hard);
 	ret->trigramms = g_hash_table_new_full (rspamd_trigram_hash,
 			rspamd_trigram_equal, NULL, rspamd_ptr_array_free_hard);
+	ret->unicode_scripts = g_hash_table_new (g_int_hash, g_int_equal);
 
 	g_assert (uc_err == U_ZERO_ERROR);
 
-- 
2.39.5