[Feature] Do not detect language twice for relative parts

author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-04-06 17:32:15 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-04-06 17:32:15 +0100
commit: 6e925ae527b87d75c8346d44c27b55252cc78dbf (patch)
tree: b5bb829351ffd70028d8563d25503ef419953b1c /src/libmime
parent: b4e985a0b9a0ad8e79a3c5c55191798e3cb32e0d (diff)
download: rspamd-6e925ae527b87d75c8346d44c27b55252cc78dbf.tar.gz
rspamd-6e925ae527b87d75c8346d44c27b55252cc78dbf.zip
2 files changed, 178 insertions, 117 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index aaae40fa5..1eae26a43 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -55,157 +55,106 @@ free_byte_array_callback (void *pointer)
 }
 
 static void
-rspamd_extract_words (struct rspamd_task *task,
+rspamd_mime_part_extract_words (struct rspamd_task *task,
 		struct rspamd_mime_text_part *part)
 {
 #ifdef WITH_SNOWBALL
 	struct sb_stemmer *stem = NULL;
 #endif
-	rspamd_stat_token_t *w, ucs_w;
+	rspamd_stat_token_t *w;
 	gchar *temp_word;
 	const guchar *r;
-	guint i, nlen, total_len = 0, short_len = 0, ucs_len = 0;
+	guint i, nlen, total_len = 0, short_len = 0;
 	gdouble avg_len = 0;
 
-	/* Ugly workaround */
-	if (IS_PART_HTML (part)) {
-		part->normalized_words = rspamd_tokenize_text (
-				part->stripped_content->data,
-				part->stripped_content->len, IS_PART_UTF (part), task->cfg,
-				part->exceptions, FALSE,
-				NULL);
-	}
-	else {
-		part->normalized_words = rspamd_tokenize_text (
-				part->stripped_content->data,
-				part->stripped_content->len, IS_PART_UTF (part), task->cfg,
-				part->exceptions, FALSE,
-				NULL);
-	}
-
 	if (part->normalized_words) {
-		part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
-				sizeof (guint64), part->normalized_words->len);
-
-		if (IS_PART_UTF (part) && task->lang_det) {
-			part->ucs32_words =  g_array_sized_new (FALSE, FALSE,
-					sizeof (rspamd_stat_token_t), part->normalized_words->len);
-		}
-
-		if (part->ucs32_words) {
-			struct rspamd_lang_detector_res *lang;
-
-			for (i = 0; i < part->normalized_words->len; i++) {
-				w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
-
-				if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
-					rspamd_language_detector_to_ucs (task->lang_det,
-							task->task_pool,
-							w, &ucs_w);
-					g_array_append_val (part->ucs32_words, ucs_w);
-					ucs_len += ucs_w.len;
-				}
-			}
-
-			part->languages = rspamd_language_detector_detect (task,
-					task->lang_det,
-					part->ucs32_words, ucs_len);
+#ifdef WITH_SNOWBALL
+		static GHashTable *stemmers = NULL;
 
-			if (part->languages->len > 0) {
-				lang = g_ptr_array_index (part->languages, 0);
-				part->language = lang->lang;
+		if (part->language && part->language[0] != '\0' && IS_PART_UTF (part)) {
 
-				msg_info_task ("detected part language: %s", part->language);
+			if (!stemmers) {
+				stemmers = g_hash_table_new (rspamd_strcase_hash,
+						rspamd_strcase_equal);
 			}
 
-#ifdef WITH_SNOWBALL
-			static GHashTable *stemmers = NULL;
+			stem = g_hash_table_lookup (stemmers, part->language);
 
-			if (part->language && part->language[0] != '\0' && IS_PART_UTF (part)) {
+			if (stem == NULL) {
 
-				if (!stemmers) {
-					stemmers = g_hash_table_new (rspamd_strcase_hash,
-							rspamd_strcase_equal);
-				}
-
-				stem = g_hash_table_lookup (stemmers, part->language);
+				stem = sb_stemmer_new (part->language, "UTF_8");
 
 				if (stem == NULL) {
-
-					stem = sb_stemmer_new (part->language, "UTF_8");
-
-					if (stem == NULL) {
-						msg_debug_task ("<%s> cannot create lemmatizer for %s language",
-								task->message_id, part->language);
-					}
-					else {
-						g_hash_table_insert (stemmers, g_strdup (part->language),
-								stem);
-					}
+					msg_debug_task ("<%s> cannot create lemmatizer for %s language",
+							task->message_id, part->language);
+				}
+				else {
+					g_hash_table_insert (stemmers, g_strdup (part->language),
+							stem);
 				}
 			}
-#endif
 		}
+#endif
+	}
 
-		for (i = 0; i < part->normalized_words->len; i ++) {
-			guint64 h;
+	for (i = 0; i < part->normalized_words->len; i ++) {
+		guint64 h;
 
-			w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
-			r = NULL;
+		w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
+		r = NULL;
 #ifdef WITH_SNOWBALL
-			if (stem) {
-				r = sb_stemmer_stem (stem, w->begin, w->len);
-			}
+		if (stem) {
+			r = sb_stemmer_stem (stem, w->begin, w->len);
+		}
 #endif
 
-			if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
-				avg_len = avg_len + (w->len - avg_len) / (double)i;
-
-				if (r != NULL) {
-					nlen = strlen (r);
-					nlen = MIN (nlen, w->len);
-					temp_word = rspamd_mempool_alloc (task->task_pool, nlen);
-					memcpy (temp_word, r, nlen);
+		if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
+			avg_len = avg_len + (w->len - avg_len) / (double)i;
 
-					if (IS_PART_UTF (part)) {
-						rspamd_str_lc_utf8 (temp_word, nlen);
-					}
-					else {
-						rspamd_str_lc (temp_word, nlen);
-					}
+			if (r != NULL) {
+				nlen = strlen (r);
+				nlen = MIN (nlen, w->len);
+				temp_word = rspamd_mempool_alloc (task->task_pool, nlen);
+				memcpy (temp_word, r, nlen);
 
-					w->begin = temp_word;
-					w->len = nlen;
+				if (IS_PART_UTF (part)) {
+					rspamd_str_lc_utf8 (temp_word, nlen);
 				}
 				else {
-					temp_word = rspamd_mempool_alloc (task->task_pool, w->len);
-					memcpy (temp_word, w->begin, w->len);
+					rspamd_str_lc (temp_word, nlen);
+				}
 
-					if (IS_PART_UTF (part)) {
-						rspamd_str_lc_utf8 (temp_word, w->len);
-					}
-					else {
-						rspamd_str_lc (temp_word, w->len);
-					}
+				w->begin = temp_word;
+				w->len = nlen;
+			}
+			else {
+				temp_word = rspamd_mempool_alloc (task->task_pool, w->len);
+				memcpy (temp_word, w->begin, w->len);
 
-					w->begin = temp_word;
+				if (IS_PART_UTF (part)) {
+					rspamd_str_lc_utf8 (temp_word, w->len);
+				}
+				else {
+					rspamd_str_lc (temp_word, w->len);
 				}
+
+				w->begin = temp_word;
 			}
+		}
 
-			if (w->len > 0) {
-				/*
-				 * We use static hash seed if we would want to use that in shingles
-				 * computation in future
-				 */
-				h = rspamd_cryptobox_fast_hash_specific (
-						RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT,
-						w->begin, w->len, words_hash_seed);
-				g_array_append_val (part->normalized_hashes, h);
-				total_len += w->len;
-
-				if (w->len <= 3) {
-					short_len ++;
-				}
+		if (w->len > 0) {
+			/*
+			 * We use static hash seed if we would want to use that in shingles
+			 * computation in future
+			 */
+			h = rspamd_cryptobox_fast_hash_specific (
+					RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT,
+					w->begin, w->len, words_hash_seed);
+			g_array_append_val (part->normalized_hashes, h);
+			total_len += w->len;
+
+			if (w->len <= 3) {
+				short_len ++;
 			}
 		}
 	}
@@ -239,8 +188,78 @@ rspamd_extract_words (struct rspamd_task *task,
 			*short_len_p += short_len;
 		}
 	}
+}
+
+static guint
+rspamd_mime_part_create_words (struct rspamd_task *task,
+		struct rspamd_mime_text_part *part)
+{
+	rspamd_stat_token_t *w, ucs_w;
+	guint i, ucs_len = 0;
+
+	/* Ugly workaround */
+	if (IS_PART_HTML (part)) {
+		part->normalized_words = rspamd_tokenize_text (
+				part->stripped_content->data,
+				part->stripped_content->len, IS_PART_UTF (part), task->cfg,
+				part->exceptions, FALSE,
+				NULL);
+	}
+	else {
+		part->normalized_words = rspamd_tokenize_text (
+				part->stripped_content->data,
+				part->stripped_content->len, IS_PART_UTF (part), task->cfg,
+				part->exceptions, FALSE,
+				NULL);
+	}
+	if (part->normalized_words) {
+		part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
+				sizeof (guint64), part->normalized_words->len);
+
+		if (IS_PART_UTF (part) && task->lang_det) {
+			part->ucs32_words = g_array_sized_new (FALSE, FALSE,
+					sizeof (rspamd_stat_token_t), part->normalized_words->len);
+		}
+
+		if (part->ucs32_words) {
+
+
+			for (i = 0; i < part->normalized_words->len; i++) {
+				w = &g_array_index (part->normalized_words, rspamd_stat_token_t,
+						i);
+
+				if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+					rspamd_language_detector_to_ucs (task->lang_det,
+							task->task_pool,
+							w, &ucs_w);
+					g_array_append_val (part->ucs32_words, ucs_w);
+					ucs_len += ucs_w.len;
+				}
+			}
+		}
+	}
+
+	return ucs_len;
+}
+
+static void
+rspamd_mime_part_detect_language (struct rspamd_task *task,
+		struct rspamd_mime_text_part *part, guint ucs_len)
+{
+	struct rspamd_lang_detector_res *lang;
 
+	if (part->ucs32_words) {
+		part->languages = rspamd_language_detector_detect (task,
+				task->lang_det,
+				part->ucs32_words, ucs_len);
 
+		if (part->languages->len > 0) {
+			lang = g_ptr_array_index (part->languages, 0);
+			part->language = lang->lang;
+
+			msg_info_task ("detected part language: %s", part->language);
+		}
+	}
 }
 
 static void
@@ -833,7 +852,7 @@ rspamd_message_process_text_part (struct rspamd_task *task,
 				text_part->exceptions);
 	}
 
-	rspamd_extract_words (task, text_part);
+	text_part->ucs_len = rspamd_mime_part_create_words (task, text_part);
 }
 
 /* Creates message from various data using libmagic to detect type */
@@ -1128,6 +1147,41 @@ rspamd_message_parse (struct rspamd_task *task)
 			if (rspamd_ftok_cmp (&p1->mime_part->parent_part->ct->subtype, &srch) == 0) {
 				if (!IS_PART_EMPTY (p1) && !IS_PART_EMPTY (p2) &&
 						p1->normalized_hashes && p2->normalized_hashes) {
+					/*
+					 * We also detect language on one part and propagate it to
+					 * another one
+					 */
+					struct rspamd_mime_text_part *sel;
+
+					/* Prefer HTML as text part is not displayed normally */
+					if (IS_PART_HTML (p1)) {
+						sel = p1;
+					}
+					else if (IS_PART_HTML (p2)) {
+						sel = p2;
+					}
+					else {
+						if (p1->ucs_len > p2->ucs_len) {
+							sel = p1;
+						}
+						else {
+							sel = p2;
+						}
+					}
+
+					rspamd_mime_part_detect_language (task, sel, sel->ucs_len);
+
+					if (sel->language && sel->language[0]) {
+						/* Propagate language */
+						if (sel == p1) {
+							p2->language = sel->language;
+							p2->languages = g_ptr_array_ref (sel->languages);
+						}
+						else {
+							p1->language = sel->language;
+							p1->languages = g_ptr_array_ref (sel->languages);
+						}
+					}
 
 					tw = p1->normalized_hashes->len + p2->normalized_hashes->len;
 
@@ -1180,6 +1234,12 @@ rspamd_message_parse (struct rspamd_task *task)
 	guint total_words = 0;
 
 	PTR_ARRAY_FOREACH (task->text_parts, i, text_part) {
+		if (!text_part->language) {
+			rspamd_mime_part_detect_language (task, text_part, text_part->ucs_len);
+		}
+
+		rspamd_mime_part_extract_words (task, text_part);
+
 		if (text_part->normalized_words) {
 			total_words += text_part->normalized_words->len;
 		}
diff --git a/src/libmime/message.h b/src/libmime/message.h
index 5ee5b4c43..0ed2a5c66 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -103,6 +103,7 @@ struct rspamd_mime_text_part {
 	guint empty_lines;
 	guint capital_letters;
 	guint numeric_characters;
+	guint ucs_len;
 };
 
 enum rspamd_received_type {
author	Vsevolod Stakhov <vsevolod@highsecure.ru>	2018-04-06 17:32:15 +0100
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>	2018-04-06 17:32:15 +0100
commit	6e925ae527b87d75c8346d44c27b55252cc78dbf (patch)
tree	b5bb829351ffd70028d8563d25503ef419953b1c /src/libmime
parent	b4e985a0b9a0ad8e79a3c5c55191798e3cb32e0d (diff)
download	rspamd-6e925ae527b87d75c8346d44c27b55252cc78dbf.tar.gz rspamd-6e925ae527b87d75c8346d44c27b55252cc78dbf.zip