[Project] Preliminary version of the n-gram-based language detector

author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-01-13 17:41:57 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-01-13 17:41:57 +0000
commit: b72c8f94ccbbe8362b38a4a9f35823367ad21a9c (patch)
tree: c8c79328cf10c669021b9a3df761e3bd9c5d36ab /src
parent: 88950e4e4f563caba44a14a40b1180be27b772d0 (diff)
download: rspamd-b72c8f94ccbbe8362b38a4a9f35823367ad21a9c.tar.gz rspamd-b72c8f94ccbbe8362b38a4a9f35823367ad21a9c.zip
7 files changed, 46 insertions, 28 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index fb9af6df7..66901e6b9 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -263,7 +263,7 @@ rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
 }
 
 static void
-rspamd_language_detector_random_select (GPtrArray *ucs_tokens, guint nwords,
+rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
 		goffset *offsets_out)
 {
 	guint step_len, remainder, i, out_idx;
@@ -362,7 +362,7 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window,
 		}
 	}
 	else {
-		if (tok->len >= cur_off) {
+		if (tok->len <= cur_off) {
 			return -1;
 		}
 
@@ -406,17 +406,21 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_lang_detector *d,
 
 		freq = ((gdouble)GPOINTER_TO_UINT (
 				g_hash_table_lookup (ngramms, window))) / class_freq;
-		cand = g_hash_table_lookup (candidates, elt->name);
 
-		if (cand == NULL) {
-			cand = g_malloc (sizeof (*cand));
-			cand->elt = elt;
-			cand->lang = elt->name;
-			cand->prob = freq;
-		}
-		else {
-			/* Update guess */
-			cand->prob += freq;
+		if (freq > 0) {
+			cand = g_hash_table_lookup (candidates, elt->name);
+
+			if (cand == NULL) {
+				cand = g_malloc (sizeof (*cand));
+				cand->elt = elt;
+				cand->lang = elt->name;
+				cand->prob = freq;
+
+				g_hash_table_insert (candidates, (gpointer)elt->name, cand);
+			} else {
+				/* Update guess */
+				cand->prob += freq;
+			}
 		}
 	}
 }
@@ -583,7 +587,7 @@ rspamd_language_detector_filter_negligible (GHashTable *candidates)
 
 static void
 rspamd_language_detector_detect_type (struct rspamd_lang_detector *d,
-		GPtrArray *ucs_tokens,
+		GArray *ucs_tokens,
 		GHashTable *candidates,
 		enum rspamd_language_gramm_type type,
 		gboolean start_over)
@@ -597,7 +601,7 @@ rspamd_language_detector_detect_type (struct rspamd_lang_detector *d,
 	rspamd_language_detector_random_select (ucs_tokens, nparts, selected_words);
 
 	/* Deal with the first word in a special case */
-	tok = g_ptr_array_index (ucs_tokens, selected_words[0]);
+	tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, selected_words[0]);
 
 	if (start_over) {
 		rspamd_language_detector_detect_word (d, tok, candidates, type);
@@ -607,7 +611,7 @@ rspamd_language_detector_detect_type (struct rspamd_lang_detector *d,
 	}
 
 	for (i = 1; i < nparts; i ++) {
-		tok = g_ptr_array_index (ucs_tokens, selected_words[i]);
+		tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, selected_words[i]);
 		rspamd_language_detector_update_guess (d, tok, candidates, type);
 	}
 
@@ -620,13 +624,13 @@ rspamd_language_detector_cmp (gconstpointer a, gconstpointer b)
 {
 	const struct rspamd_lang_detector_res
 			*canda = *(const struct rspamd_lang_detector_res **)a,
-			*candb = *(const struct rspamd_lang_detector_res **)a;
+			*candb = *(const struct rspamd_lang_detector_res **)b;
 
 	if (canda->prob > candb->prob) {
-		return 1;
+		return -1;
 	}
 	else if (candb->prob > canda->prob) {
-		return -1;
+		return 1;
 	}
 
 	return 0;
@@ -634,7 +638,7 @@ rspamd_language_detector_cmp (gconstpointer a, gconstpointer b)
 
 GPtrArray *
 rspamd_language_detector_detect (struct rspamd_lang_detector *d,
-		GPtrArray *ucs_tokens, gsize words_len)
+		GArray *ucs_tokens, gsize words_len)
 {
 	GHashTable *candidates;
 	GPtrArray *result;
@@ -690,6 +694,7 @@ rspamd_language_detector_detect (struct rspamd_lang_detector *d,
 
 	while (g_hash_table_iter_next (&it, &k, &v)) {
 		cand = (struct rspamd_lang_detector_res *) v;
+		msg_err ("%s -> %.2f", cand->lang, cand->prob);
 		g_ptr_array_add (result, cand);
 		g_hash_table_iter_steal (&it);
 	}
diff --git a/src/libmime/lang_detection.h b/src/libmime/lang_detection.h
index 9373b09f2..048e425f6 100644
--- a/src/libmime/lang_detection.h
+++ b/src/libmime/lang_detection.h
@@ -55,6 +55,6 @@ void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
  * @return array of struct rspamd_lang_detector_res sorted by freq descending
  */
 GPtrArray * rspamd_language_detector_detect (struct rspamd_lang_detector *d,
-		GPtrArray *ucs_tokens, gsize words_len);
+		GArray *ucs_tokens, gsize words_len);
 
 #endif
diff --git a/src/libmime/message.c b/src/libmime/message.c
index af1147770..4bac77062 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -93,6 +93,8 @@ rspamd_extract_words (struct rspamd_task *task,
 		}
 
 		if (part->ucs32_words) {
+			struct rspamd_lang_detector_res *lang;
+
 			for (i = 0; i < part->normalized_words->len; i++) {
 				w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
 
@@ -103,6 +105,16 @@ rspamd_extract_words (struct rspamd_task *task,
 				ucs_len += ucs_w.len;
 			}
 
+			part->languages = rspamd_language_detector_detect (task->lang_det,
+					part->ucs32_words, ucs_len);
+
+			if (part->languages->len > 0) {
+				lang = g_ptr_array_index (part->languages, 0);
+				part->language = lang->lang;
+
+				msg_info_task ("detected part language: %s", part->language);
+			}
+
 #ifdef WITH_SNOWBALL
 			static GHashTable *stemmers = NULL;
 
@@ -869,7 +881,6 @@ rspamd_message_parse (struct rspamd_task *task)
 
 	if (RSPAMD_TASK_IS_EMPTY (task)) {
 		/* Don't do anything with empty task */
-
 		return TRUE;
 	}
 
diff --git a/src/libmime/message.h b/src/libmime/message.h
index 90f86b3bd..5ee5b4c43 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -78,10 +78,8 @@ struct rspamd_mime_part {
 #define IS_PART_HTML(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_HTML)
 
 struct rspamd_mime_text_part {
-	guint flags;
-	GUnicodeScript script;
-	const gchar *lang_code;
 	const gchar *language;
+	GPtrArray *languages;
 	const gchar *real_charset;
 	rspamd_ftok_t raw;
 	rspamd_ftok_t parsed;
@@ -95,6 +93,7 @@ struct rspamd_mime_text_part {
 	GArray *normalized_words;
 	GArray *ucs32_words;
 	GArray *normalized_hashes;
+	guint flags;
 	guint nlines;
 	guint spaces;
 	guint non_ascii_chars;
diff --git a/src/libserver/task.c b/src/libserver/task.c
index 7b665d983..961af1c9f 100644
--- a/src/libserver/task.c
+++ b/src/libserver/task.c
@@ -242,6 +242,9 @@ rspamd_task_free (struct rspamd_task *task)
 			if (tp->ucs32_words) {
 				g_array_free (tp->ucs32_words, TRUE);
 			}
+			if (tp->languages) {
+				g_ptr_array_free (tp->languages, TRUE);
+			}
 		}
 
 		if (task->rcpt_envelope) {
diff --git a/src/libstat/backends/sqlite3_backend.c b/src/libstat/backends/sqlite3_backend.c
index 6563d0dc1..8682ca73f 100644
--- a/src/libstat/backends/sqlite3_backend.c
+++ b/src/libstat/backends/sqlite3_backend.c
@@ -365,8 +365,8 @@ rspamd_sqlite3_get_language (struct rspamd_stat_sqlite3_db *db,
 		for (i = 0; i < task->text_parts->len; i++) {
 			tp = g_ptr_array_index (task->text_parts, i);
 
-			if (tp->lang_code != NULL && tp->lang_code[0] != '\0' &&
-					strcmp (tp->lang_code, "en") != 0) {
+			if (tp->language != NULL && tp->language[0] != '\0' &&
+					strcmp (tp->language, "en") != 0) {
 				language = tp->language;
 				break;
 			}
diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c
index cce78ff3a..7fc8f74ac 100644
--- a/src/lua/lua_mimepart.c
+++ b/src/lua/lua_mimepart.c
@@ -745,8 +745,8 @@ lua_textpart_get_language (lua_State * L)
 	struct rspamd_mime_text_part *part = lua_check_textpart (L);
 
 	if (part != NULL) {
-		if (part->lang_code != NULL && part->lang_code[0] != '\0') {
-			lua_pushstring (L, part->lang_code);
+		if (part->language != NULL && part->language[0] != '\0') {
+			lua_pushstring (L, part->language);
 			return 1;
 		}
 	}
author	Vsevolod Stakhov <vsevolod@highsecure.ru>	2018-01-13 17:41:57 +0000
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>	2018-01-13 17:41:57 +0000
commit	b72c8f94ccbbe8362b38a4a9f35823367ad21a9c (patch)
tree	c8c79328cf10c669021b9a3df761e3bd9c5d36ab /src
parent	88950e4e4f563caba44a14a40b1180be27b772d0 (diff)
download	rspamd-b72c8f94ccbbe8362b38a4a9f35823367ad21a9c.tar.gz rspamd-b72c8f94ccbbe8362b38a4a9f35823367ad21a9c.zip