- Store unicode in UTF parts - Store unicode for HTML parts - Rename struct fields and split them into unicode/utf components

5 years ago · a64ce9b424
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -1323,7 +1323,7 @@ rspamd_language_detector_cmp_heuristic (gconstpointer a, gconstpointer b,
 GPtrArray *
 rspamd_language_detector_detect (struct rspamd_task *task,
 		struct rspamd_lang_detector *d,
 		GArray *ucs_tokens, gsize words_len)
 		GArray *ucs_tokens)
 {
 	khash_t(rspamd_candidates_hash) *candidates;
 	GPtrArray *result;
--- a/src/libmime/lang_detection.h
+++ b/src/libmime/lang_detection.h
@@ -61,6 +61,6 @@ void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
 */
 GPtrArray * rspamd_language_detector_detect (struct rspamd_task *task,
 		struct rspamd_lang_detector *d,
 		GArray *ucs_tokens, gsize words_len);
 		GArray *ucs_tokens);

 #endif
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -67,7 +67,7 @@ rspamd_mime_part_extract_words (struct rspamd_task *task,
 	guint i, nlen, total_len = 0, short_len = 0;
 	gdouble avg_len = 0;

 	if (part->normalized_words) {
 	if (part->utf_words) {
 #ifdef WITH_SNOWBALL
 		static GHashTable *stemmers = NULL;

@@ -97,10 +97,10 @@ rspamd_mime_part_extract_words (struct rspamd_task *task,
 #endif


 		for (i = 0; i < part->normalized_words->len; i++) {
 		for (i = 0; i < part->utf_words->len; i++) {
 			guint64 h;

 			w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
 			w = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
 			r = NULL;
 #ifdef WITH_SNOWBALL
 			if (stem) {
@@ -156,7 +156,7 @@ rspamd_mime_part_extract_words (struct rspamd_task *task,
 			}
 		}

 		if (part->normalized_words && part->normalized_words->len) {
 		if (part->utf_words && part->utf_words->len) {
 			gdouble *avg_len_p, *short_len_p;

 			avg_len_p = rspamd_mempool_get_variable (task->task_pool,
@@ -205,41 +205,41 @@ rspamd_mime_part_create_words (struct rspamd_task *task,

 	/* Ugly workaround */
 	if (IS_PART_HTML (part)) {
 		part->normalized_words = rspamd_tokenize_text (
 				part->stripped_content->data,
 				part->stripped_content->len, tok_type, task->cfg,
 		part->utf_words = rspamd_tokenize_text (
 				part->utf_stripped_content->data,
 				part->utf_stripped_content->len, tok_type, task->cfg,
 				part->exceptions,
 				NULL);
 	}
 	else {
 		part->normalized_words = rspamd_tokenize_text (
 				part->stripped_content->data,
 				part->stripped_content->len, tok_type, task->cfg,
 		part->utf_words = rspamd_tokenize_text (
 				part->utf_stripped_content->data,
 				part->utf_stripped_content->len, tok_type, task->cfg,
 				part->exceptions,
 				NULL);
 	}

 	if (part->normalized_words) {
 	if (part->utf_words) {
 		part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
 				sizeof (guint64), part->normalized_words->len);
 				sizeof (guint64), part->utf_words->len);

 		if (IS_PART_UTF (part) && task->lang_det) {
 			part->ucs32_words = g_array_sized_new (FALSE, FALSE,
 					sizeof (rspamd_stat_token_t), part->normalized_words->len);
 			part->unicode_words = g_array_sized_new (FALSE, FALSE,
 					sizeof (rspamd_stat_token_t), part->utf_words->len);
 		}

 		if (part->ucs32_words) {
 		if (part->unicode_words) {


 			for (i = 0; i < part->normalized_words->len; i++) {
 				w = &g_array_index (part->normalized_words, rspamd_stat_token_t,
 			for (i = 0; i < part->utf_words->len; i++) {
 				w = &g_array_index (part->utf_words, rspamd_stat_token_t,
 						i);

 				if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
 					rspamd_language_detector_to_ucs (task->lang_det,
 							task->task_pool,
 							w, &ucs_w);
 					g_array_append_val (part->ucs32_words, ucs_w);
 					g_array_append_val (part->unicode_words, ucs_w);
 					ucs_len += ucs_w.len;
 				}
 			}
@@ -251,14 +251,14 @@ rspamd_mime_part_create_words (struct rspamd_task *task,

 static void
 rspamd_mime_part_detect_language (struct rspamd_task *task,
 		struct rspamd_mime_text_part *part, guint ucs_len)
 		struct rspamd_mime_text_part *part)
 {
 	struct rspamd_lang_detector_res *lang;

 	if (part->ucs32_words) {
 	if (part->unicode_words) {
 		part->languages = rspamd_language_detector_detect (task,
 				task->lang_det,
 				part->ucs32_words, ucs_len);
 				part->unicode_words);

 		if (part->languages->len > 0) {
 			lang = g_ptr_array_index (part->languages, 0);
@@ -289,7 +289,7 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
 				state = seen_cr;
 				if (p > c) {
 					last_c = *(p - 1);
 					g_byte_array_append (part->stripped_content,
 					g_byte_array_append (part->utf_stripped_content,
 							(const guint8 *)c, p - c);
 				}

@@ -299,11 +299,11 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
 			case seen_cr:
 				/* Double \r\r */
 				if (!crlf_added) {
 					g_byte_array_append (part->stripped_content,
 					g_byte_array_append (part->utf_stripped_content,
 							(const guint8 *)" ", 1);
 					crlf_added = TRUE;
 					g_ptr_array_add (part->newlines,
 							(((gpointer) (goffset) (part->stripped_content->len))));
 							(((gpointer) (goffset) (part->utf_stripped_content->len))));
 				}

 				part->nlines ++;
@@ -326,17 +326,17 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,

 				if (p > c) {
 					last_c = *(p - 1);
 					g_byte_array_append (part->stripped_content,
 					g_byte_array_append (part->utf_stripped_content,
 							(const guint8 *)c, p - c);
 				}

 				c = p + 1;

 				if (IS_PART_HTML (part) || g_ascii_ispunct (last_c)) {
 					g_byte_array_append (part->stripped_content,
 					g_byte_array_append (part->utf_stripped_content,
 							(const guint8 *)" ", 1);
 					g_ptr_array_add (part->newlines,
 							(((gpointer) (goffset) (part->stripped_content->len))));
 							(((gpointer) (goffset) (part->utf_stripped_content->len))));
 					crlf_added = TRUE;
 				}
 				else {
@@ -348,13 +348,13 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
 				/* \r\n */
 				if (!crlf_added) {
 					if (IS_PART_HTML (part) || g_ascii_ispunct (last_c)) {
 						g_byte_array_append (part->stripped_content,
 						g_byte_array_append (part->utf_stripped_content,
 								(const guint8 *) " ", 1);
 						crlf_added = TRUE;
 					}

 					g_ptr_array_add (part->newlines,
 							(((gpointer) (goffset) (part->stripped_content->len))));
 							(((gpointer) (goffset) (part->utf_stripped_content->len))));
 				}

 				c = p + 1;
@@ -364,11 +364,11 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
 			case seen_lf:
 				/* Double \n\n */
 				if (!crlf_added) {
 					g_byte_array_append (part->stripped_content,
 					g_byte_array_append (part->utf_stripped_content,
 							(const guint8 *)" ", 1);
 					crlf_added = TRUE;
 					g_ptr_array_add (part->newlines,
 							(((gpointer) (goffset) (part->stripped_content->len))));
 							(((gpointer) (goffset) (part->utf_stripped_content->len))));
 				}

 				part->nlines++;
@@ -414,13 +414,13 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,

 				if (!crlf_added) {
 					g_ptr_array_add (part->newlines,
 							(((gpointer) (goffset) (part->stripped_content->len))));
 							(((gpointer) (goffset) (part->utf_stripped_content->len))));
 				}

 				/* Skip initial spaces */
 				if (G_UNLIKELY (*p == ' ')) {
 					if (!crlf_added) {
 						g_byte_array_append (part->stripped_content,
 						g_byte_array_append (part->utf_stripped_content,
 								(const guint8 *)" ", 1);
 					}

@@ -451,7 +451,7 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,

 		switch (state) {
 		case normal_char:
 			g_byte_array_append (part->stripped_content,
 			g_byte_array_append (part->utf_stripped_content,
 					(const guint8 *)c, p - c);

 			while (c < p) {
@@ -479,10 +479,10 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
 		default:

 			if (!crlf_added) {
 				g_byte_array_append (part->stripped_content,
 				g_byte_array_append (part->utf_stripped_content,
 						(const guint8 *)" ", 1);
 				g_ptr_array_add (part->newlines,
 						(((gpointer) (goffset) (part->stripped_content->len))));
 						(((gpointer) (goffset) (part->utf_stripped_content->len))));
 			}

 			part->nlines++;
@@ -502,10 +502,10 @@ rspamd_normalize_text_part (struct rspamd_task *task,
 	struct rspamd_process_exception *ex;

 	/* Strip newlines */
 	part->stripped_content = g_byte_array_sized_new (part->content->len);
 	part->utf_stripped_content = g_byte_array_sized_new (part->utf_content->len);
 	part->newlines = g_ptr_array_sized_new (128);
 	p = (const gchar *)part->content->data;
 	end = p + part->content->len;
 	p = (const gchar *)part->utf_content->data;
 	end = p + part->utf_content->len;

 	rspamd_strip_newlines_parse (p, end, part);

@@ -513,7 +513,7 @@ rspamd_normalize_text_part (struct rspamd_task *task,
 		ex = rspamd_mempool_alloc (task->task_pool, sizeof (*ex));
 		off = (goffset)g_ptr_array_index (part->newlines, i);
 		g_ptr_array_index (part->newlines, i) = (gpointer)(goffset)
 				(part->stripped_content->data + off);
 				(part->utf_stripped_content->data + off);
 		ex->pos = off;
 		ex->len = 0;
 		ex->type = RSPAMD_EXCEPTION_NEWLINE;
@@ -522,7 +522,7 @@ rspamd_normalize_text_part (struct rspamd_task *task,

 	rspamd_mempool_add_destructor (task->task_pool,
 			(rspamd_mempool_destruct_t) free_byte_array_callback,
 			part->stripped_content);
 			part->utf_stripped_content);
 	rspamd_mempool_add_destructor (task->task_pool,
 			(rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
 			part->newlines);
@@ -615,10 +615,10 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part
 		g_assert (rspamd_multipattern_compile (gtube_matcher, NULL));
 	}

 	if (part->content && part->content->len >= sizeof (gtube_pattern_reject) &&
 			part->content->len <= max_check_size) {
 		if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->content->data,
 				part->content->len,
 	if (part->utf_content && part->utf_content->len >= sizeof (gtube_pattern_reject) &&
 			part->utf_content->len <= max_check_size) {
 		if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->utf_content->data,
 				part->utf_content->len,
 				rspamd_multipattern_gtube_cb, NULL, NULL)) > 0) {

 			switch (ret) {
@@ -639,7 +639,7 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part
 				msg_info_task (
 						"<%s>: gtube %s pattern has been found in part of length %ud",
 						task->message_id, rspamd_action_to_str (act),
 						part->content->len);
 						part->utf_content->len);
 			}
 		}
 	}
@@ -655,9 +655,86 @@ exceptions_compare_func (gconstpointer a, gconstpointer b)
 	return ea->pos - eb->pos;
 }

 /*
  * Prepares a plain text part: runs charset conversion and, when a UTF-8
  * buffer is available, exposes the raw converted buffers as the processed
  * content. Returns FALSE only when a non-empty part has no UTF-8 buffer
  * after conversion; empty parts are flagged and reported as TRUE.
  */
 static gboolean
 rspamd_message_process_plain_text_part (struct rspamd_task *task,
 										struct rspamd_mime_text_part *text_part)
 {
 	if (text_part->parsed.len != 0) {
 		rspamd_mime_text_part_maybe_convert (task, text_part);

 		if (text_part->utf_raw_content == NULL) {
 			/*
 			 * Unconverted parts are ignored: treating them as text
 			 * is considered dangerous
 			 */
 			return FALSE;
 		}

 		/* Unlike HTML there are no tags to strip: raw content is the content */
 		text_part->utf_content = text_part->utf_raw_content;
 		text_part->unicode_content = text_part->unicode_raw_content;
 	}
 	else {
 		/* Nothing was decoded from the mime encoding */
 		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
 	}

 	return TRUE;
 }

 static gboolean
 rspamd_message_process_html_text_part (struct rspamd_task *task,
 										struct rspamd_mime_text_part *text_part)
 {
 	text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML;

 	if (text_part->parsed.len == 0) {
 		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;

 		return TRUE;
 	}

 	rspamd_mime_text_part_maybe_convert (task, text_part);

 	if (text_part->utf_raw_content == NULL) {
 		return FALSE;
 	}

 	text_part->html = rspamd_mempool_alloc0 (task->task_pool,
 			sizeof (*text_part->html));
 	text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_BALANCED;
 	text_part->utf_content = rspamd_html_process_part_full (
 			task->task_pool,
 			text_part->html,
 			text_part->utf_raw_content,
 			&text_part->exceptions,
 			task->urls,
 			task->emails);

 	if (text_part->utf_content->len == 0) {
 		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
 	}

 	/* Also add unicode content */
 	text_part->unicode_content =  g_array_sized_new (FALSE, FALSE,
 			sizeof (UChar), text_part->utf_content->len + 1);
 	rspamd_utf_to_unicode (text_part->utf_content, text_part->unicode_content);

 	rspamd_mempool_add_destructor (task->task_pool,
 			(rspamd_mempool_destruct_t) free_byte_array_callback,
 			text_part->utf_content);
 	rspamd_mempool_add_destructor (task->task_pool,
 			rspamd_array_free_hard,
 			text_part->unicode_content);

 	return TRUE;
 }

 static void
 rspamd_message_process_text_part (struct rspamd_task *task,
 	struct rspamd_mime_part *mime_part)
 rspamd_message_process_text_part_maybe (struct rspamd_task *task,
 										struct rspamd_mime_part *mime_part)
 {
 	struct rspamd_mime_text_part *text_part;
 	rspamd_ftok_t html_tok, xhtml_tok;
@@ -738,87 +815,31 @@ rspamd_message_process_text_part (struct rspamd_task *task,
 		debug_task ("skip attachments for checking as text parts");
 		return;
 	}

 	if (found_html) {
 		text_part = rspamd_mempool_alloc0 (task->task_pool,
 				sizeof (struct rspamd_mime_text_part));
 		text_part->raw.begin = mime_part->raw_data.begin;
 		text_part->raw.len = mime_part->raw_data.len;
 		text_part->parsed.begin = mime_part->parsed_data.begin;
 		text_part->parsed.len = mime_part->parsed_data.len;
 		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML;
 		text_part->mime_part = mime_part;

 		if (mime_part->parsed_data.len == 0) {
 			text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
 			g_ptr_array_add (task->text_parts, text_part);
 			return;
 		}

 		rspamd_mime_text_part_maybe_convert (task, text_part);

 		if (text_part->utf_raw_content == NULL) {
 			return;
 		}

 		text_part->html = rspamd_mempool_alloc0 (task->task_pool,
 				sizeof (*text_part->html));
 		text_part->mime_part = mime_part;

 		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_BALANCED;
 		text_part->content = rspamd_html_process_part_full (
 				task->task_pool,
 				text_part->html,
 				text_part->utf_raw_content,
 				&text_part->exceptions,
 				task->urls,
 				task->emails);

 		if (text_part->content->len == 0) {
 			text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
 		}

 		rspamd_mempool_add_destructor (task->task_pool,
 			(rspamd_mempool_destruct_t) free_byte_array_callback,
 			text_part->content);
 		g_ptr_array_add (task->text_parts, text_part);
 	else if (!(found_txt || found_html)) {
 		/* Not a text part */
 		return;
 	}
 	else if (found_txt) {
 		text_part =
 			rspamd_mempool_alloc0 (task->task_pool,
 				sizeof (struct rspamd_mime_text_part));
 		text_part->mime_part = mime_part;
 		text_part->raw.begin = mime_part->raw_data.begin;
 		text_part->raw.len = mime_part->raw_data.len;
 		text_part->parsed.begin = mime_part->parsed_data.begin;
 		text_part->parsed.len = mime_part->parsed_data.len;
 		text_part->mime_part = mime_part;

 		if (mime_part->parsed_data.len == 0) {
 			text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
 			g_ptr_array_add (task->text_parts, text_part);
 			return;
 		}

 		rspamd_mime_text_part_maybe_convert (task, text_part);
 	text_part = rspamd_mempool_alloc0 (task->task_pool,
 			sizeof (struct rspamd_mime_text_part));
 	text_part->mime_part = mime_part;
 	text_part->raw.begin = mime_part->raw_data.begin;
 	text_part->raw.len = mime_part->raw_data.len;
 	text_part->parsed.begin = mime_part->parsed_data.begin;
 	text_part->parsed.len = mime_part->parsed_data.len;

 		if (text_part->utf_raw_content != NULL) {
 			/*
 			 * We ignore unconverted parts from now as it is dangerous
 			 * to treat them as text parts
 			 */
 			text_part->content = text_part->utf_raw_content;
 			g_ptr_array_add (task->text_parts, text_part);
 		}
 		else {
 	if (found_html) {
 		if (!rspamd_message_process_html_text_part (task, text_part)) {
 			return;
 		}
 	}
 	else {
 		return;
 		if (!rspamd_message_process_plain_text_part (task, text_part)) {
 			return;
 		}
 	}


 	g_ptr_array_add (task->text_parts, text_part);
 	mime_part->flags |= RSPAMD_MIME_PART_TEXT;
 	mime_part->specific.txt = text_part;

@@ -867,7 +888,7 @@ rspamd_message_process_text_part (struct rspamd_task *task,
 				text_part->exceptions);
 	}

 	text_part->ucs_len = rspamd_mime_part_create_words (task, text_part);
 	rspamd_mime_part_create_words (task, text_part);
 }

 /* Creates message from various data using libmagic to detect type */
@@ -1172,7 +1193,7 @@ rspamd_message_process (struct rspamd_task *task)
 		struct rspamd_mime_part *part;

 		part = g_ptr_array_index (task->parts, i);
 		rspamd_message_process_text_part (task, part);
 		rspamd_message_process_text_part_maybe (task, part);
 	}

 	rspamd_images_process (task);
@@ -1207,7 +1228,7 @@ rspamd_message_process (struct rspamd_task *task)
 						sel = p2;
 					}
 					else {
 						if (p1->ucs_len > p2->ucs_len) {
 						if (p1->unicode_content->len > p2->unicode_content->len) {
 							sel = p1;
 						}
 						else {
@@ -1215,7 +1236,7 @@ rspamd_message_process (struct rspamd_task *task)
 						}
 					}

 					rspamd_mime_part_detect_language (task, sel, sel->ucs_len);
 					rspamd_mime_part_detect_language (task, sel);

 					if (sel->language && sel->language[0]) {
 						/* Propagate language */
@@ -1274,13 +1295,13 @@ rspamd_message_process (struct rspamd_task *task)

 	PTR_ARRAY_FOREACH (task->text_parts, i, text_part) {
 		if (!text_part->language) {
 			rspamd_mime_part_detect_language (task, text_part, text_part->ucs_len);
 			rspamd_mime_part_detect_language (task, text_part);
 		}

 		rspamd_mime_part_extract_words (task, text_part);

 		if (text_part->normalized_words) {
 			total_words += text_part->normalized_words->len;
 		if (text_part->utf_words) {
 			total_words += text_part->utf_words->len;
 		}
 	}

--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -86,20 +86,28 @@ struct rspamd_mime_text_part {
 	const gchar *language;
 	GPtrArray *languages;
 	const gchar *real_charset;

 	/* Raw data in native encoding */
 	rspamd_ftok_t raw;
 	rspamd_ftok_t parsed; /* decoded from mime encodings */
 	GByteArray *content; /* utf8 encoded processed content */

 	GArray *ucs_raw_content; /* unicode raw content (of UChar) */
 	/* UTF8 content */
 	GByteArray *utf_content; /* utf8 encoded processed content */
 	GByteArray *utf_raw_content; /* utf raw content */
 	GByteArray *stripped_content; /* utf content with no newlines */
 	GByteArray *utf_stripped_content; /* utf content with no newlines */
 	GArray *normalized_hashes;
 	GArray *utf_words;

 	/* Unicode content, used by libicu */
 	GArray *unicode_raw_content; /* unicode raw content (of UChar) */
 	GArray *unicode_content; /* unicode processed content (of UChar) */
 	GArray *unicode_words;

 	GPtrArray *newlines;	/**< positions of newlines in text, relative to content*/
 	struct html_content *html;
 	GList *exceptions;	/**< list of offsets of urls						*/
 	struct rspamd_mime_part *mime_part;
 	GArray *normalized_words;
 	GArray *ucs32_words;
 	GArray *normalized_hashes;

 	guint flags;
 	guint nlines;
 	guint spaces;
@@ -110,7 +118,6 @@ struct rspamd_mime_text_part {
 	guint empty_lines;
 	guint capital_letters;
 	guint numeric_characters;
 	guint ucs_len;
 };

 enum rspamd_received_type {
--- a/src/libmime/mime_encoding.c
+++ b/src/libmime/mime_encoding.c
@@ -283,18 +283,18 @@ rspamd_mime_text_part_ucs_from_utf (struct rspamd_task *task,

 	rspamd_mime_utf8_conv_init ();
 	utf = text_part->utf_raw_content;
 	text_part->ucs_raw_content = g_array_sized_new (FALSE, FALSE,
 	text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE,
 			sizeof (UChar), utf->len + 1);
 	text_part->ucs_raw_content->len = ucnv_toUChars (utf8_converter,
 			(UChar *)text_part->ucs_raw_content->data,
 	text_part->unicode_raw_content->len = ucnv_toUChars (utf8_converter,
 			(UChar *)text_part->unicode_raw_content->data,
 			utf->len + 1,
 			utf->data,
 			utf->len,
 			&uc_err);

 	if (!U_SUCCESS (uc_err)) {
 		g_array_free (text_part->ucs_raw_content, TRUE);
 		text_part->ucs_raw_content = NULL;
 		g_array_free (text_part->unicode_raw_content, TRUE);
 		text_part->unicode_raw_content = NULL;
 	}
 }

@@ -311,12 +311,12 @@ rspamd_mime_text_part_normalise (struct rspamd_task *task,
 		norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err);
 	}

 	if (!text_part->ucs_raw_content) {
 	if (!text_part->unicode_raw_content) {
 		return;
 	}

 	src = (UChar *)text_part->ucs_raw_content->data;
 	nsym = text_part->ucs_raw_content->len;
 	src = (UChar *)text_part->unicode_raw_content->data;
 	nsym = text_part->unicode_raw_content->len;

 	/* We can now check if we need to decompose */
 	end = unorm2_spanQuickCheckYes (norm, src, nsym, &uc_err);
@@ -346,8 +346,8 @@ rspamd_mime_text_part_normalise (struct rspamd_task *task,
 	}
 	else {
 		/* Copy normalised back */
 		memcpy (text_part->ucs_raw_content->data, dest, nsym * sizeof (UChar));
 		text_part->ucs_raw_content->len = nsym;
 		memcpy (text_part->unicode_raw_content->data, dest, nsym * sizeof (UChar));
 		text_part->unicode_raw_content->len = nsym;
 		text_part->flags |= RSPAMD_MIME_TEXT_PART_NORMALISED;
 	}

@@ -369,16 +369,16 @@ rspamd_mime_text_part_maybe_renormalise (struct rspamd_task *task,
 	rspamd_mime_utf8_conv_init ();

 	if ((text_part->flags & RSPAMD_MIME_TEXT_PART_NORMALISED) &&
 		text_part->ucs_raw_content) {
 		text_part->unicode_raw_content) {
 		clen = ucnv_getMaxCharSize (utf8_converter);
 		dlen = UCNV_GET_MAX_BYTES_FOR_STRING (text_part->ucs_raw_content->len,
 		dlen = UCNV_GET_MAX_BYTES_FOR_STRING (text_part->unicode_raw_content->len,
 				clen);
 		g_byte_array_set_size (text_part->utf_raw_content, dlen);
 		r = ucnv_fromUChars (utf8_converter,
 				text_part->utf_raw_content->data,
 				dlen,
 				(UChar *)text_part->ucs_raw_content->data,
 				text_part->ucs_raw_content->len,
 				(UChar *)text_part->unicode_raw_content->data,
 				text_part->unicode_raw_content->len,
 				&uc_err);
 		text_part->utf_raw_content->len = r;
 	}
@@ -410,10 +410,10 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
 	}


 	text_part->ucs_raw_content = g_array_sized_new (FALSE, FALSE,
 	text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE,
 			sizeof (UChar), input->len + 1);
 	r = ucnv_toUChars (conv,
 			(UChar *)text_part->ucs_raw_content->data,
 			(UChar *)text_part->unicode_raw_content->data,
 			input->len + 1,
 			input->data,
 			input->len,
@@ -426,7 +426,7 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
 		return FALSE;
 	}

 	text_part->ucs_raw_content->len = r;
 	text_part->unicode_raw_content->len = r;
 	rspamd_mime_text_part_normalise (task, text_part);

 	/* Now, convert to utf8 */
@@ -434,7 +434,7 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
 	dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
 	d = rspamd_mempool_alloc (task->task_pool, dlen);
 	r = ucnv_fromUChars (utf8_converter, d, dlen,
 			(UChar *)text_part->ucs_raw_content->data, r, &uc_err);
 			(UChar *)text_part->unicode_raw_content->data, r, &uc_err);

 	if (!U_SUCCESS (uc_err)) {
 		g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
@@ -750,3 +750,17 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,

 	SET_PART_UTF (text_part);
 }

 /*
  * Converts the UTF-8 bytes in `in` to libicu UChars stored in `dest`.
  *
  * `dest` is resized to in->len + 1 elements before the conversion and its
  * length is then set to the number of UChars written. If libicu reports an
  * error, or returns a length that does not fit the buffer, `dest` is left
  * empty instead of carrying a bogus length.
  */
 void
 rspamd_utf_to_unicode (GByteArray *in, GArray *dest)
 {
 	UErrorCode uc_err = U_ZERO_ERROR;
 	int32_t nconv;

 	/* Same initialisation the other users of utf8_converter perform */
 	rspamd_mime_utf8_conv_init ();

 	g_array_set_size (dest, in->len + 1);
 	nconv = ucnv_toUChars (utf8_converter,
 			(UChar *)dest->data,
 			in->len + 1,
 			in->data,
 			in->len,
 			&uc_err);

 	if (U_SUCCESS (uc_err) && nconv >= 0 && (guint)nconv <= in->len + 1) {
 		dest->len = nconv;
 	}
 	else {
 		/* On overflow libicu returns the required length, not the written one */
 		g_array_set_size (dest, 0);
 	}
 }
--- a/src/libmime/mime_encoding.h
+++ b/src/libmime/mime_encoding.h
@@ -86,4 +86,11 @@ gboolean rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
 */
 void rspamd_mime_charset_utf_enforce (gchar *in, gsize len);

 /**
 * Converts utf8 to libicu unichars
 * @param in source bytes, treated as utf8 input for the converter
 * @param dest destination array of UChar elements; it is resized to
 * in->len + 1 elements and its length is then set from the conversion result
 */
 void rspamd_utf_to_unicode (GByteArray *in, GArray *dest);

 #endif /* SRC_LIBMIME_MIME_ENCODING_H_ */
--- a/src/libserver/re_cache.c
+++ b/src/libserver/re_cache.c
@@ -905,8 +905,8 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
 							raw = TRUE;
 						}

 						in = part->content->data;
 						len = part->content->len;
 						in = part->utf_content->data;
 						len = part->utf_content->len;
 					}
 				}

@@ -1006,9 +1006,9 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
 		for (i = 0; i < task->text_parts->len; i++) {
 			part = g_ptr_array_index (task->text_parts, i);

 			if (part->stripped_content) {
 				scvec[i + 1] = (guchar *)part->stripped_content->data;
 				lenvec[i + 1] = part->stripped_content->len;
 			if (part->utf_stripped_content) {
 				scvec[i + 1] = (guchar *)part->utf_stripped_content->data;
 				lenvec[i + 1] = part->utf_stripped_content->len;
 			}
 			else {
 				scvec[i + 1] = (guchar *)"";
--- a/src/libserver/task.c
+++ b/src/libserver/task.c
@@ -242,20 +242,20 @@ rspamd_task_free (struct rspamd_task *task)
 		for (i = 0; i < task->text_parts->len; i ++) {
 			tp = g_ptr_array_index (task->text_parts, i);

 			if (tp->normalized_words) {
 				g_array_free (tp->normalized_words, TRUE);
 			if (tp->utf_words) {
 				g_array_free (tp->utf_words, TRUE);
 			}
 			if (tp->normalized_hashes) {
 				g_array_free (tp->normalized_hashes, TRUE);
 			}
 			if (tp->ucs32_words) {
 				g_array_free (tp->ucs32_words, TRUE);
 			if (tp->unicode_words) {
 				g_array_free (tp->unicode_words, TRUE);
 			}
 			if (tp->languages) {
 				g_ptr_array_unref (tp->languages);
 			}
 			if (tp->ucs_raw_content) {
 				g_array_free (tp->ucs_raw_content, TRUE);
 			if (tp->unicode_raw_content) {
 				g_array_free (tp->unicode_raw_content, TRUE);
 			}
 		}

--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -2624,7 +2624,7 @@ rspamd_url_text_extract (rspamd_mempool_t *pool,
 {
 	struct rspamd_url_mimepart_cbdata mcbd;

 	if (part->stripped_content == NULL || part->stripped_content->len == 0) {
 	if (part->utf_stripped_content == NULL || part->utf_stripped_content->len == 0) {
 		msg_warn_task ("got empty text part");
 		return;
 	}
@@ -2632,8 +2632,8 @@ rspamd_url_text_extract (rspamd_mempool_t *pool,
 	mcbd.task = task;
 	mcbd.part = part;

 	rspamd_url_find_multiple (task->task_pool, part->stripped_content->data,
 			part->stripped_content->len, is_html, part->newlines,
 	rspamd_url_find_multiple (task->task_pool, part->utf_stripped_content->data,
 			part->utf_stripped_content->len, is_html, part->newlines,
 			rspamd_url_text_part_callback, &mcbd);
 }

--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -331,8 +331,8 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
 	for (i = 0; i < task->text_parts->len; i++) {
 		part = g_ptr_array_index (task->text_parts, i);

 		if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) {
 			reserved_len += part->normalized_words->len;
 		if (!IS_PART_EMPTY (part) && part->utf_words != NULL) {
 			reserved_len += part->utf_words->len;
 		}
 		/* XXX: normal window size */
 		reserved_len += 5;
@@ -346,9 +346,9 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
 	for (i = 0; i < task->text_parts->len; i ++) {
 		part = g_ptr_array_index (task->text_parts, i);

 		if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) {
 		if (!IS_PART_EMPTY (part) && part->utf_words != NULL) {
 			st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool,
 					part->normalized_words, IS_PART_UTF (part),
 					part->utf_words, IS_PART_UTF (part),
 					NULL, task->tokens);
 		}

--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -59,7 +59,7 @@ const gchar t_delimiters[255] = {

 /* Get next word from specified f_str_t buf */
 static gboolean
 rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
 rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf,
 		gchar const **cur, rspamd_stat_token_t * token,
 		GList **exceptions, gsize *rl, gboolean unused)
 {
@@ -149,7 +149,7 @@ rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
 }

 static gboolean
 rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
 rspamd_tokenizer_get_word_utf8 (rspamd_stat_token_t * buf,
 		gchar const **cur, rspamd_stat_token_t * token,
 		GList **exceptions, gsize *rl,
 		gboolean check_signature)
@@ -355,10 +355,10 @@ rspamd_tokenize_text (const gchar *text, gsize len,

 	switch (how) {
 	case RSPAMD_TOKENIZE_RAW:
 		func = rspamd_tokenizer_get_word_compat;
 		func = rspamd_tokenizer_get_word_raw;
 		break;
 	case RSPAMD_TOKENIZE_UTF:
 		func = rspamd_tokenizer_get_word;
 		func = rspamd_tokenizer_get_word_utf8;
 		break;
 	default:
 		g_assert_not_reached ();
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -28,7 +28,7 @@ struct rspamd_stat_tokenizer {
 /*
  * Word extraction modes.
  * NOTE(review): presumably the `how` selector of rspamd_tokenize_text, whose
  * switch handles only RSPAMD_TOKENIZE_RAW and RSPAMD_TOKENIZE_UTF and
  * asserts on anything else - confirm before using the last value.
  */
 enum rspamd_tokenize_type {
 	RSPAMD_TOKENIZE_UTF = 0,
 	RSPAMD_TOKENIZE_RAW,
 	RSPAMD_TOKENIZE_UCS
 	RSPAMD_TOKENIZE_UNICODE
 };

 /* Compare two token nodes */
--- a/src/lua/lua_mimepart.c
+++ b/src/lua/lua_mimepart.c
@@ -549,16 +549,16 @@ lua_textpart_get_content (lua_State * L)
 	rspamd_lua_setclass (L, "rspamd{text}", -1);

 	if (!type) {
 		start = part->content->data;
 		len = part->content->len;
 		start = part->utf_content->data;
 		len = part->utf_content->len;
 	}
 	else if (strcmp (type, "content") == 0) {
 		start = part->content->data;
 		len = part->content->len;
 		start = part->utf_content->data;
 		len = part->utf_content->len;
 	}
 	else if (strcmp (type, "content_oneline") == 0) {
 		start = part->stripped_content->data;
 		len = part->stripped_content->len;
 		start = part->utf_stripped_content->data;
 		len = part->utf_stripped_content->len;
 	}
 	else if (strcmp (type, "raw_parsed") == 0) {
 		start = part->parsed.begin;
@@ -618,8 +618,8 @@ lua_textpart_get_content_oneline (lua_State * L)

 	t = lua_newuserdata (L, sizeof (*t));
 	rspamd_lua_setclass (L, "rspamd{text}", -1);
 	t->start = part->stripped_content->data;
 	t->len = part->stripped_content->len;
 	t->start = part->utf_stripped_content->data;
 	t->len = part->utf_stripped_content->len;
 	t->flags = 0;

 	return 1;
@@ -636,11 +636,11 @@ lua_textpart_get_length (lua_State * L)
 		return 1;
 	}

 	if (IS_PART_EMPTY (part) || part->content == NULL) {
 	if (IS_PART_EMPTY (part) || part->utf_content == NULL) {
 		lua_pushinteger (L, 0);
 	}
 	else {
 		lua_pushinteger (L, part->content->len);
 		lua_pushinteger (L, part->utf_content->len);
 	}

 	return 1;
@@ -721,11 +721,11 @@ lua_textpart_get_words_count (lua_State *L)
 		return 1;
 	}

 	if (IS_PART_EMPTY (part) || part->normalized_words == NULL) {
 	if (IS_PART_EMPTY (part) || part->utf_words == NULL) {
 		lua_pushinteger (L, 0);
 	}
 	else {
 		lua_pushinteger (L, part->normalized_words->len);
 		lua_pushinteger (L, part->utf_words->len);
 	}

 	return 1;
@@ -743,14 +743,14 @@ lua_textpart_get_words (lua_State *L)
 		return luaL_error (L, "invalid arguments");
 	}

 	if (IS_PART_EMPTY (part) || part->normalized_words == NULL) {
 	if (IS_PART_EMPTY (part) || part->utf_words == NULL) {
 		lua_createtable (L, 0, 0);
 	}
 	else {
 		lua_createtable (L, part->normalized_words->len, 0);
 		lua_createtable (L, part->utf_words->len, 0);

 		for (i = 0; i < part->normalized_words->len; i ++) {
 			w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
 		for (i = 0; i < part->utf_words->len; i ++) {
 			w = &g_array_index (part->utf_words, rspamd_stat_token_t, i);

 			lua_pushlstring (L, w->begin, w->len);
 			lua_rawseti (L, -2, i + 1);
@@ -876,8 +876,8 @@ struct lua_shingle_data {
 };

 #define STORE_TOKEN(i, t) do { \
    if ((i) < part->normalized_words->len) { \
        word = &g_array_index (part->normalized_words, rspamd_stat_token_t, (i)); \
    if ((i) < part->utf_words->len) { \
        word = &g_array_index (part->utf_words, rspamd_stat_token_t, (i)); \
        sd->t.begin = word->begin; \
        sd->t.len = word->len; \
    } \
@@ -936,8 +936,8 @@ lua_textpart_get_fuzzy_hashes (lua_State * L)
 		/* Calculate direct hash */
 		rspamd_cryptobox_hash_init (&st, key, rspamd_cryptobox_HASHKEYBYTES);

 		for (i = 0; i < part->normalized_words->len; i ++) {
 			word = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
 		for (i = 0; i < part->utf_words->len; i ++) {
 			word = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
 			rspamd_cryptobox_hash_update (&st, word->begin, word->len);
 		}

@@ -947,7 +947,7 @@ lua_textpart_get_fuzzy_hashes (lua_State * L)
 				sizeof (hexdigest));
 		lua_pushlstring (L, hexdigest, sizeof (hexdigest) - 1);

 		sgl = rspamd_shingles_from_text (part->normalized_words, key,
 		sgl = rspamd_shingles_from_text (part->utf_words, key,
 				pool, lua_shingles_filter, part, RSPAMD_SHINGLES_MUMHASH);

 		if (sgl == NULL) {
--- a/src/lua/lua_trie.c
+++ b/src/lua/lua_trie.c
@@ -262,9 +262,9 @@ lua_trie_search_mime (lua_State *L)
 		for (i = 0; i < task->text_parts->len; i ++) {
 			part = g_ptr_array_index (task->text_parts, i);

 			if (!IS_PART_EMPTY (part) && part->content != NULL) {
 				text = part->content->data;
 				len = part->content->len;
 			if (!IS_PART_EMPTY (part) && part->utf_content != NULL) {
 				text = part->utf_content->data;
 				len = part->utf_content->len;

 				if (lua_trie_search_str (L, trie, text, len) != 0) {
 					found = TRUE;
--- a/src/plugins/chartable.c
+++ b/src/plugins/chartable.c
@@ -560,13 +560,13 @@ rspamd_chartable_process_part (struct rspamd_task *task,
 	guint i, ncap = 0;
 	gdouble cur_score = 0.0;

 	if (part == NULL || part->normalized_words == NULL ||
 			part->normalized_words->len == 0) {
 	if (part == NULL || part->utf_words == NULL ||
 			part->utf_words->len == 0) {
 		return;
 	}

 	for (i = 0; i < part->normalized_words->len; i++) {
 		w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
 	for (i = 0; i < part->utf_words->len; i++) {
 		w = &g_array_index (part->utf_words, rspamd_stat_token_t, i);

 		if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {

@@ -588,7 +588,7 @@ rspamd_chartable_process_part (struct rspamd_task *task,
 	 */
 	part->capital_letters += ncap;

 	cur_score /= (gdouble)part->normalized_words->len;
 	cur_score /= (gdouble)part->utf_words->len;

 	if (cur_score > 2.0) {
 		cur_score = 2.0;
--- a/src/plugins/fuzzy_check.c
+++ b/src/plugins/fuzzy_check.c
@@ -1196,7 +1196,7 @@ fuzzy_io_fin (void *ud)
 /*
  * Returns a word array taken directly from the given text part; no copy is
  * made and `pool` is not referenced in the body.
  *
  * NOTE(review): this text is a patch rendering whose +/- markers were
  * stripped. The two `return` statements below appear to be the removed
  * (normalized_words) and added (utf_words) versions of a single line, not
  * two statements meant to coexist - confirm against the actual tree.
  */
 static GArray *
 fuzzy_preprocess_words (struct rspamd_mime_text_part *part, rspamd_mempool_t *pool)
 {
 	return part->normalized_words;
 	return part->utf_words;
 }

 static void
@@ -1418,8 +1418,8 @@ fuzzy_cmd_from_text_part (struct rspamd_task *task,
 			rspamd_cryptobox_hash_init (&st, rule->hash_key->str,
 					rule->hash_key->len);

 			rspamd_cryptobox_hash_update (&st, part->stripped_content->data,
 					part->stripped_content->len);
 			rspamd_cryptobox_hash_update (&st, part->utf_stripped_content->data,
 					part->utf_stripped_content->len);

 			if (task->subject) {
 				/* We also include subject */
@@ -2615,7 +2615,7 @@ fuzzy_generate_commands (struct rspamd_task *task, struct fuzzy_rule *rule,
 			}

 			/* Check length of part */
 			fac = rule->ctx->text_multiplier * part->content->len;
 			fac = rule->ctx->text_multiplier * part->utf_content->len;
 			if ((double)min_bytes > fac) {
 				if (!rule->short_text_direct_hash) {
 					msg_info_task (
@@ -2624,7 +2624,7 @@ fuzzy_generate_commands (struct rspamd_task *task, struct fuzzy_rule *rule,
 									"skip fuzzy check",
 							task->message_id, min_bytes,
 							fac,
 							part->content->len,
 							part->utf_content->len,
 							rule->ctx->text_multiplier);
 					continue;
 				}
@@ -2635,21 +2635,21 @@ fuzzy_generate_commands (struct rspamd_task *task, struct fuzzy_rule *rule,
 									"use direct hash",
 							task->message_id, min_bytes,
 							fac,
 							part->content->len,
 							part->utf_content->len,
 							rule->ctx->text_multiplier);
 					short_text = TRUE;
 				}
 			}

 			if (part->normalized_words == NULL ||
 					part->normalized_words->len == 0) {
 			if (part->utf_words == NULL ||
 					part->utf_words->len == 0) {
 				msg_info_task ("<%s>, part hash empty, skip fuzzy check",
 						task->message_id);
 				continue;
 			}

 			if (rule->ctx->min_hash_len != 0 &&
 					part->normalized_words->len <
 					part->utf_words->len <
 							rule->ctx->min_hash_len) {
 				if (!rule->short_text_direct_hash) {
 					msg_info_task (