From 0d53332a7ecaa3a2b5020c7c58d6146d72d7b05c Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov <vsevolod@highsecure.ru>
Date: Sun, 25 Nov 2018 12:00:24 +0000
Subject: [Project] Various unicode fixes in language detector

---
 src/libmime/lang_detection.c        | 58 ++++++++++++-------------------------
 src/libmime/lang_detection.h        | 11 -------
 src/libmime/message.c               |  1 -
 src/libmime/message.h               |  4 ---
 src/libserver/task.c                |  3 --
 src/libstat/tokenizers/tokenizers.c |  5 ++--
 6 files changed, 20 insertions(+), 62 deletions(-)

diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index b2a2f1f6c..dfcbb527a 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -24,6 +24,7 @@
 
 #include <glob.h>
 #include <unicode/utf8.h>
+#include <unicode/utf16.h>
 #include <unicode/ucnv.h>
 #include <unicode/uchar.h>
 #include <unicode/ustring.h>
@@ -873,31 +874,6 @@ end:
 	return ret;
 }
 
-
-void
-rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
-		rspamd_mempool_t *pool,
-		rspamd_stat_token_t *utf_token, rspamd_stat_token_t *ucs_token)
-{
-	UChar *out;
-	int32_t nsym;
-	UErrorCode uc_err = U_ZERO_ERROR;
-
-	ucs_token->flags = utf_token->flags;
-	out = rspamd_mempool_alloc (pool, sizeof (*out) * (utf_token->normalized.len + 1));
-	nsym = ucnv_toUChars (d->uchar_converter, out, (utf_token->normalized.len + 1),
-			utf_token->normalized.begin, utf_token->normalized.len, &uc_err);
-
-	if (nsym >= 0 && uc_err == U_ZERO_ERROR) {
-		rspamd_language_detector_ucs_lowercase (out, nsym);
-		ucs_token->normalized.begin = (const gchar *) out;
-		ucs_token->normalized.len = nsym;
-	}
-	else {
-		ucs_token->normalized.len = 0;
-	}
-}
-
 static void
 rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
 		goffset *offsets_out)
@@ -905,6 +881,7 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
 	guint step_len, remainder, i, out_idx;
 	guint64 coin, sel;
 	rspamd_stat_token_t *tok;
+	UChar32 first, last;
 
 	g_assert (nwords != 0);
 	g_assert (offsets_out != NULL);
@@ -942,11 +919,17 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
 		for (;;) {
 			tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, sel);
 			/* Filter bad tokens */
-			if (tok->normalized.len >= 2 &&
-				u_isalpha (*(UChar *)tok->normalized.begin) &&
-				u_isalpha (*(((UChar *)tok->normalized.begin) + (tok->normalized.len - 1)))) {
-				offsets_out[out_idx] = sel;
-				break;
+			/* Non-alphabetic tokens must also reach the else branch (ntries) */
+			first = last = 0xFFFD; /* U+FFFD is never alphabetic */
+			if (tok->normalized.len >= 2) {
+				U16_GET_OR_FFFD (tok->normalized.begin, 0, 0,
+						tok->normalized.len, first);
+				U16_GET_OR_FFFD (tok->normalized.begin, 0,
+						tok->normalized.len - 1, tok->normalized.len, last);
+			}
+			if (u_isalpha (first) && u_isalpha (last)) {
+				offsets_out[out_idx] = sel;
+				break;
 			}
 			else {
 				ntries ++;
@@ -966,8 +949,6 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
 		}
 	}
 
-
-
 	/*
 	 * Fisher-Yates algorithm:
 	 * for i from 0 to n−2 do
@@ -1001,13 +982,13 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window,
 			window[0] = (UChar)' ';
 
 			for (i = 0; i < wlen - 1; i ++) {
-				window[i + 1] = *(((UChar *)tok->normalized.begin) + i);
+				window[i + 1] = tok->unicode.begin[i];
 			}
 		}
 		else if (cur_off + wlen == tok->normalized.len + 1) {
 			/* Add trailing space */
 			for (i = 0; i < wlen - 1; i ++) {
-				window[i] = *(((UChar *)tok->normalized.begin) + cur_off + i);
+				window[i] = tok->unicode.begin[cur_off + i];
 			}
 			window[wlen - 1] = (UChar)' ';
 		}
@@ -1018,7 +999,7 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window,
 		else {
 			/* Normal case */
 			for (i = 0; i < wlen; i++) {
-				window[i] = *(((UChar *) tok->normalized.begin) + cur_off + i);
+				window[i] = tok->unicode.begin[cur_off + i];
 			}
 		}
 	}
@@ -1027,7 +1008,7 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window,
 			return -1;
 		}
 
-		window[0] = *(((UChar *)tok->normalized.begin) + cur_off);
+		window[0] = tok->unicode.begin[cur_off];
 	}
 
 	return cur_off + 1;
@@ -1200,10 +1181,7 @@ rspamd_language_detector_detect_type (struct rspamd_task *task,
 	for (i = 0; i < nparts; i++) {
 		tok = &g_array_index (words, rspamd_stat_token_t,
 				selected_words[i]);
-		rspamd_language_detector_to_ucs (task->lang_det,
-				task->task_pool,
-				tok, &ucs_w);
-		rspamd_language_detector_detect_word (task, d, &ucs_w, candidates,
+		rspamd_language_detector_detect_word (task, d, tok, candidates,
 				d->trigramms[cat]);
 	}
 
diff --git a/src/libmime/lang_detection.h b/src/libmime/lang_detection.h
index 204bdf9af..517ab037e 100644
--- a/src/libmime/lang_detection.h
+++ b/src/libmime/lang_detection.h
@@ -62,17 +62,6 @@ struct rspamd_lang_detector* rspamd_language_detector_init (struct rspamd_config
 struct rspamd_lang_detector* rspamd_language_detector_ref (struct rspamd_lang_detector* d);
 void rspamd_language_detector_unref (struct rspamd_lang_detector* d);
 
-/**
- * Convert string from utf8 to ucs32
- * @param d
- * @param utf_token
- * @param ucs_token
- */
-void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
-		rspamd_mempool_t *pool,
-		rspamd_stat_token_t *utf_token,
-		rspamd_stat_token_t *ucs_token);
-
 /**
  * Try to detect language of words
  * @param d
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 7572a4178..4a765643a 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -711,7 +711,6 @@ rspamd_message_process_plain_text_part (struct rspamd_task *task,
 	if (text_part->utf_raw_content != NULL) {
 		/* Different from HTML, where we also parse HTML and strip tags */
 		text_part->utf_content = text_part->utf_raw_content;
-		text_part->unicode_content = text_part->unicode_raw_content;
 	}
 	else {
 		/*
diff --git a/src/libmime/message.h b/src/libmime/message.h
index 0f5c3dfb7..ed9dfef6e 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -104,10 +104,6 @@ struct rspamd_mime_text_part {
 	GArray *utf_words;
 	UText utf_stripped_text; /* Used by libicu to represent the utf8 content */
 
-	/* Unicode content, used by libicu */
-	GArray *unicode_raw_content; /* unicode raw content (of UChar) */
-	GArray *unicode_content; /* unicode processed content (of UChar) */
-
 	GPtrArray *newlines;	/**< positions of newlines in text, relative to content*/
 	struct html_content *html;
 	GList *exceptions;	/**< list of offsets of urls						*/
diff --git a/src/libserver/task.c b/src/libserver/task.c
index de2745701..6135bced4 100644
--- a/src/libserver/task.c
+++ b/src/libserver/task.c
@@ -267,9 +267,6 @@ rspamd_task_free (struct rspamd_task *task)
 			if (tp->languages) {
 				g_ptr_array_unref (tp->languages);
 			}
-			if (tp->unicode_raw_content) {
-				g_array_free (tp->unicode_raw_content, TRUE);
-			}
 		}
 
 		if (task->rcpt_envelope) {
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 9bbe899fb..d27d9bc58 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -271,9 +271,8 @@ rspamd_tokenize_text (const gchar *text, gsize len,
 	buf.original.begin = text;
 	buf.original.len = len;
 	buf.flags = 0;
-	token.original.begin = NULL;
-	token.original.len = 0;
-	token.flags = 0;
+
+	memset (&token, 0, sizeof (token));
 
 	if (cfg != NULL) {
 		min_len = cfg->min_word_len;
-- 
cgit v1.2.3