From c31f8bf12bff61c9422de9eeff0292c6ac339c5e Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov <vsevolod@highsecure.ru>
Date: Thu, 6 Sep 2018 19:49:44 +0100
Subject: [PATCH] [Feature] Implement new text tokenizer based on libicu

---
 src/libmime/message.c               |  23 +-
 src/libstat/stat_process.c          |  12 +-
 src/libstat/tokenizers/tokenizers.c | 418 ++++++++++++++--------------
 src/libstat/tokenizers/tokenizers.h |   3 +
 src/lua/lua_util.c                  |  11 +-
 src/plugins/chartable.c             |  12 +-
 6 files changed, 258 insertions(+), 221 deletions(-)

diff --git a/src/libmime/message.c b/src/libmime/message.c
index e59d34b25..4ec021843 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -203,21 +203,14 @@ rspamd_mime_part_create_words (struct rspamd_task *task,
 		tok_type = RSPAMD_TOKENIZE_RAW;
 	}
 
-	/* Ugly workaround */
-	if (IS_PART_HTML (part)) {
-		part->utf_words = rspamd_tokenize_text (
-				part->utf_stripped_content->data,
-				part->utf_stripped_content->len, tok_type, task->cfg,
-				part->exceptions,
-				NULL);
-	}
-	else {
-		part->utf_words = rspamd_tokenize_text (
-				part->utf_stripped_content->data,
-				part->utf_stripped_content->len, tok_type, task->cfg,
-				part->exceptions,
-				NULL);
-	}
+	part->utf_words = rspamd_tokenize_text (
+			part->utf_stripped_content->data,
+			part->utf_stripped_content->len,
+			&part->utf_stripped_text,
+			tok_type, task->cfg,
+			part->exceptions,
+			NULL);
+
 
 	if (part->utf_words) {
 		part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c
index 394173444..6d34ba51c 100644
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -365,8 +365,18 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
 	}
 
 	if (sub != NULL) {
-		words = rspamd_tokenize_text (sub, strlen (sub), RSPAMD_TOKENIZE_UTF,
+		UText utxt = UTEXT_INITIALIZER;
+		UErrorCode uc_err = U_ZERO_ERROR;
+		gsize slen = strlen (sub);
+
+		utext_openUTF8 (&utxt,
+				sub,
+				slen,
+				&uc_err);
+
+		words = rspamd_tokenize_text (sub, slen, &utxt, RSPAMD_TOKENIZE_UTF,
 				NULL, NULL, NULL);
+
 		if (words != NULL) {
 
 			for (i = 0; i < words->len; i ++) {
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 5436430fe..9babfc8a1 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -21,8 +21,10 @@
 #include "tokenizers.h"
 #include "stat_internal.h"
 #include "../../../contrib/mumhash/mum.h"
-#include "unicode/utf8.h"
-#include "unicode/uchar.h"
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
+#include <unicode/uiter.h>
+#include <unicode/ubrk.h>
 
 typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos,
 		rspamd_stat_token_t * token,
@@ -148,187 +150,88 @@ rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf,
 	return TRUE;
 }
 
-static gboolean
-rspamd_tokenizer_get_word_utf8 (rspamd_stat_token_t * buf,
-		gchar const **cur, rspamd_stat_token_t * token,
-		GList **exceptions, gsize *rl,
-		gboolean check_signature)
+static inline gboolean
+rspamd_tokenize_check_limit (gboolean decay,
+							 guint word_decay,
+							 guint nwords,
+							 guint64 *hv,
+							 guint64 *prob,
+							 const rspamd_stat_token_t *token,
+							 gssize remain,
+							 gssize total)
 {
-	gint32 i, siglen = 0, remain;
-	goffset pos;
-	const gchar *p, *s, *sig = NULL;
-	UChar32 uc;
-	guint processed = 0;
-	struct rspamd_process_exception *ex = NULL;
-	enum {
-		skip_delimiters = 0,
-		feed_token,
-		process_signature
-	} state = skip_delimiters;
-
-	if (buf == NULL) {
-		return FALSE;
-	}
-
-	if (exceptions != NULL && *exceptions != NULL) {
-		ex = (*exceptions)->data;
-	}
-
-	g_assert (cur != NULL);
-
-	if (*cur == NULL) {
-		*cur = buf->begin;
-	}
+	static const gdouble avg_word_len = 6.0;
 
-	token->len = 0;
+	if (!decay) {
+		if (token->len >= sizeof (guint64)) {
+#ifdef _MUM_UNALIGNED_ACCESS
+			*hv = mum_hash_step (*hv, *(guint64 *)token->begin);
+#else
+			guint64 tmp;
+			memcpy (&tmp, token->begin, sizeof (tmp));
+			*hv = mum_hash_step (*hv, tmp);
+#endif
+		}
 
-	pos = *cur - buf->begin;
-	if (pos >= buf->len) {
-		return FALSE;
-	}
+		/* Check for decay */
+		if (word_decay > 0 && nwords > word_decay && remain < (gssize)total) {
+			/* Start decay */
+			gdouble decay_prob;
 
-	remain = buf->len - pos;
-	s = *cur;
-	p = s;
-	token->begin = s;
+			*hv = mum_hash_finish (*hv);
 
-	for (i = 0; i < remain; ) {
-		p = &s[i];
-		U8_NEXT (s, i, remain, uc); /* This also advances i */
+			/* We assume that an average word is 6 symbols long */
+			decay_prob = (gdouble)word_decay / ((total - (remain)) / avg_word_len);
 
-		if (uc < 0) {
-			if (i < remain) {
-				uc = 0xFFFD;
+			if (decay_prob >= 1.0) {
+				*prob = G_MAXUINT64;
 			}
 			else {
-				return FALSE;
+				*prob = decay_prob * G_MAXUINT64;
 			}
-		}
 
-		switch (state) {
-		case skip_delimiters:
-			if (ex != NULL && p - buf->begin == ex->pos) {
-				goto process_exception;
-			}
-			else if (u_isgraph (uc)) {
-				if (u_isalnum (uc)) {
-					state = feed_token;
-					token->begin = p;
-					continue;
-				}
-				else if (check_signature && pos != 0 && (*p == '_' || *p == '-')) {
-					sig = p;
-					siglen = remain - i;
-					state = process_signature;
-					continue;
-				}
-			}
-			break;
-		case feed_token:
-			if (ex != NULL && p - buf->begin == (gint)ex->pos) {
-				token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
-				goto process_exception;
-			}
-			else if (!u_isalnum (uc)) {
-				token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
-				goto set_token;
-			}
-			processed ++;
-			break;
-		case process_signature:
-			if (*p == '\r' || *p == '\n') {
-				msg_debug ("signature found: %*s", (gint)siglen, sig);
-				return FALSE;
-			}
-			else if (*p != ' ' && *p != '-' && *p != '_') {
-				state = skip_delimiters;
-				continue;
-			}
-			break;
+			return TRUE;
 		}
 	}
+	else {
+		/* Decaying probability */
+		/* LCG64 x[n] = a x[n - 1] + b mod 2^64 */
+		*hv = (*hv) * 2862933555777941757ULL + 3037000493ULL;
 
-	/* Last character */
-	if (state == feed_token) {
-		p = &s[i];
-		goto set_token;
+		if (*hv > *prob) {
+			return TRUE;
+		}
 	}
 
 	return FALSE;
+}
 
-set_token:
-	if (rl) {
-		*rl = processed;
-	}
+static inline gboolean
+rspamd_utf_word_valid (const gchar *text, const gchar *end,
+		gint32 start, gint32 finish)
+{
+	const gchar *st = text + start, *fin = text + finish;
+	UChar32 c;
 
-	if (token->len == 0 && processed > 0) {
-		token->len = p - token->begin;
-		g_assert (token->len > 0);
+	if (st >= end || fin > end || st >= fin) {
+		return FALSE;
 	}
 
-	*cur = &s[i];
-
-	return TRUE;
-
-process_exception:
-	if (token->len == 0 && processed > 0) {
-		/*
-		 * We have processed something before the next exception, so
-		 * continue processing on next iteration of this function call
-		 */
-		token->len = p - token->begin;
-		g_assert (token->len > 0);
-
-		*cur = p;
+	U8_NEXT (text, start, finish, c);
 
+	if (u_isalnum (c)) {
 		return TRUE;
 	}
 
-	if (ex->type == RSPAMD_EXCEPTION_URL) {
-		token->begin = "!!EX!!";
-		token->len = sizeof ("!!EX!!") - 1;
-		token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
-		processed = token->len;
-	}
-
-	p += ex->len;
-
-	/* We need to skip all exceptions that are within this exception */
-	*exceptions = g_list_next (*exceptions);
-
-	while (*exceptions) {
-		ex = (*exceptions)->data;
-
-		if (ex->pos < p - buf->begin) {
-			/* Nested exception */
-			if (ex->pos + ex->len > p - buf->begin) {
-				/*
-				 * We have somehow overlapping nesting exception,
-				 * extend current offset
-				 */
-				p = buf->begin + ex->pos + ex->len;
-			}
-
-			*exceptions = g_list_next (*exceptions);
-		}
-		else {
-			break;
-		}
-	}
-
-	*cur = p;
-
-	if (rl) {
-		*rl = processed;
-	}
-
-	return TRUE;
+	return FALSE;
 }
 
 GArray *
 rspamd_tokenize_text (const gchar *text, gsize len,
+					  const UText *utxt,
 					  enum rspamd_tokenize_type how,
-					  struct rspamd_config *cfg, GList *exceptions,
+					  struct rspamd_config *cfg,
+					  GList *exceptions,
 					  guint64 *hash)
 {
 	rspamd_stat_token_t token, buf;
@@ -336,11 +239,11 @@ rspamd_tokenize_text (const gchar *text, gsize len,
 	gsize l = 0;
 	GArray *res;
 	GList *cur = exceptions;
-	token_get_function func;
 	guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128;
 	guint64 hv = 0;
 	gboolean decay = FALSE;
 	guint64 prob;
+	static UBreakIterator* bi = NULL;
 
 	if (text == NULL) {
 		return NULL;
@@ -353,18 +256,6 @@ rspamd_tokenize_text (const gchar *text, gsize len,
 	token.len = 0;
 	token.flags = 0;
 
-	switch (how) {
-	case RSPAMD_TOKENIZE_RAW:
-		func = rspamd_tokenizer_get_word_raw;
-		break;
-	case RSPAMD_TOKENIZE_UTF:
-		func = rspamd_tokenizer_get_word_utf8;
-		break;
-	default:
-		g_assert_not_reached ();
-		break;
-	}
-
 	if (cfg != NULL) {
 		min_len = cfg->min_word_len;
 		max_len = cfg->max_word_len;
@@ -375,56 +266,177 @@ rspamd_tokenize_text (const gchar *text, gsize len,
 	res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t),
 			initial_size);
 
-	while (func (&buf, &pos, &token, &cur, &l, FALSE)) {
-		if (l == 0 || (min_len > 0 && l < min_len) ||
-					(max_len > 0 && l > max_len)) {
+	if (G_UNLIKELY (how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) {
+		while (rspamd_tokenizer_get_word_raw (&buf, &pos, &token, &cur, &l, FALSE)) {
+			if (l == 0 || (min_len > 0 && l < min_len) ||
+				(max_len > 0 && l > max_len)) {
+				token.begin = pos;
+				continue;
+			}
+
+			if (rspamd_tokenize_check_limit (decay, word_decay, res->len,
+					&hv, &prob, &token, pos - text, len)) {
+				if (!decay) {
+					decay = TRUE;
+				} else {
+					token.begin = pos;
+					continue;
+				}
+			}
+
+			g_array_append_val (res, token);
 			token.begin = pos;
-			continue;
 		}
+	}
+	else {
+		/* UTF-8 text: split on ICU word boundaries */
+		UErrorCode uc_err = U_ZERO_ERROR;
+		int32_t last, p;
+		struct rspamd_process_exception *ex = NULL;
 
-		if (!decay) {
-			if (token.len >= sizeof (guint64)) {
-#ifdef _MUM_UNALIGNED_ACCESS
-				hv = mum_hash_step (hv, *(guint64 *)token.begin);
-#else
-				guint64 tmp;
-				memcpy (&tmp, token.begin, sizeof (tmp));
-				hv = mum_hash_step (hv, tmp);
-#endif
-			}
+		if (bi == NULL) {
+			bi = ubrk_open (UBRK_WORD, NULL, NULL, 0, &uc_err);
 
-			/* Check for decay */
-			if (word_decay > 0 && res->len > word_decay && pos - text < (gssize)len) {
-				/* Start decay */
-				gdouble decay_prob;
+			g_assert (U_SUCCESS (uc_err));
+		}
 
-				decay = TRUE;
-				hv = mum_hash_finish (hv);
+		ubrk_setUText (bi, (UText*)utxt, &uc_err);
+		last = ubrk_first (bi);
+		p = last;
 
-				/* We assume that word is 6 symbols length in average */
-				decay_prob = (gdouble)word_decay / ((len - (pos - text)) / 6.0);
+		if (cur) {
+			ex = (struct rspamd_process_exception *)cur->data;
+		}
 
-				if (decay_prob >= 1.0) {
-					prob = G_MAXUINT64;
+		while (p != UBRK_DONE) {
+start_over:
+			token.len = 0;
+
+			if (p > last) {
+				if (ex && cur) {
+					/* Check exception */
+					if (ex->pos >= last && ex->pos <= p) {
+						/* We have an exception within boundary */
+						/* First, drain the exceptions located at the start of the span */
+						while (cur && ex->pos <= last) {
+							/* The exception is at the beginning, skip it */
+							last += ex->len;
+
+							if (last > p) {
+								/* The exception spreads past the current boundary */
+								while (last > p && p != UBRK_DONE) {
+									p = ubrk_next (bi);
+								}
+
+								/* We need to reset our scan with new p and last */
+								goto start_over;
+							}
+
+							if (ex->type == RSPAMD_EXCEPTION_URL) {
+								token.begin = "!!EX!!";
+								token.len = sizeof ("!!EX!!") - 1;
+								token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
+
+								g_array_append_val (res, token);
+								token.flags = 0;
+							}
+
+							cur = g_list_next (cur);
+
+							if (cur) {
+								ex = (struct rspamd_process_exception *) cur->data;
+							}
+						}
+
+						/* Now, we can have an exception within boundary again */
+						if (cur && ex->pos >= last && ex->pos <= p) {
+							/* Append the first part */
+							if (rspamd_utf_word_valid (text, text + len, last,
+									ex->pos)) {
+								token.begin = text + last;
+								token.len = ex->pos - last;
+								token.flags = 0;
+								g_array_append_val (res, token);
+							}
+
+							/* Process the current exception */
+							last += ex->len + token.len;
+
+							if (ex->type == RSPAMD_EXCEPTION_URL) {
+								token.begin = "!!EX!!";
+								token.len = sizeof ("!!EX!!") - 1;
+								token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
+
+								g_array_append_val (res, token);
+							}
+
+							if (last > p) {
+								/* The exception spreads past the current boundary */
+								while (last > p && p != UBRK_DONE) {
+									p = ubrk_next (bi);
+								}
+								/* We need to reset our scan with new p and last */
+								goto start_over;
+							}
+						}
+						else if (p > last) {
+							if (rspamd_utf_word_valid (text, text + len, last, p)) {
+								token.begin = text + last;
+								token.len = p - last;
+								token.flags = 0;
+							}
+						}
+					}
+					else if (ex->pos < last) {
+						/* Advance the exceptions list */
+						while (cur && ex->pos <= last) {
+							/* The exception is at the beginning, skip it */
+							cur = g_list_next (cur);
+
+							if (cur) {
+								ex = (struct rspamd_process_exception *) cur->data;
+							}
+						}
+
+						if (rspamd_utf_word_valid (text, text + len, last, p)) {
+							token.begin = text + last;
+							token.len = p - last;
+							token.flags = 0;
+						}
+					}
+					else {
+						/* No exceptions within boundary */
+						if (rspamd_utf_word_valid (text, text + len, last, p)) {
+							token.begin = text + last;
+							token.len = p - last;
+							token.flags = 0;
+						}
+					}
 				}
 				else {
-					prob = decay_prob * G_MAXUINT64;
+					if (rspamd_utf_word_valid (text, text + len, last, p)) {
+						token.begin = text + last;
+						token.len = p - last;
+					}
+				}
+
+				if (rspamd_tokenize_check_limit (decay, word_decay, res->len,
+						&hv, &prob, &token, pos - text, len)) {
+					if (!decay) {
+						decay = TRUE;
+					} else {
+						token.len = 0;
+					}
 				}
 			}
-		}
-		else {
-			/* Decaying probability */
-			/* LCG64 x[n] = a x[n - 1] + b mod 2^64 */
-			hv = 2862933555777941757ULL * hv + 3037000493ULL;
 
-			if (hv > prob) {
-				token.begin = pos;
-				continue;
+			if (token.len > 0) {
+				g_array_append_val (res, token);
 			}
-		}
 
-		g_array_append_val (res, token);
-		token.begin = pos;
+			last = p;
+			p = ubrk_next (bi);
+		}
 	}
 
 	if (!decay) {
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index 16ab142fd..6c538eafc 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -7,6 +7,8 @@
 #include "rspamd.h"
 #include "stat_api.h"
 
+#include <unicode/utext.h>
+
 #define RSPAMD_DEFAULT_TOKENIZER "osb"
 
 struct rspamd_tokenizer_runtime;
@@ -37,6 +39,7 @@ gint token_node_compare_func (gconstpointer a, gconstpointer b);
 
 /* Tokenize text into array of words (rspamd_stat_token_t type) */
 GArray * rspamd_tokenize_text (const gchar *text, gsize len,
+							   const UText *utxt,
 							   enum rspamd_tokenize_type how,
 							   struct rspamd_config *cfg,
 							   GList *exceptions,
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 3de68e60a..d6095ab52 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -1078,6 +1078,7 @@ lua_util_tokenize_text (lua_State *L)
 	GList *exceptions = NULL, *cur;
 	struct rspamd_lua_text *t;
 	struct rspamd_process_exception *ex;
+	UText utxt = UTEXT_INITIALIZER;
 	GArray *res;
 	rspamd_stat_token_t *w;
 
@@ -1129,7 +1130,15 @@ lua_util_tokenize_text (lua_State *L)
 		exceptions = g_list_reverse (exceptions);
 	}
 
-	res = rspamd_tokenize_text ((gchar *)in, len, RSPAMD_TOKENIZE_UTF, NULL,
+	UErrorCode uc_err = U_ZERO_ERROR;
+	utext_openUTF8 (&utxt,
+			in,
+			len,
+			&uc_err);
+
+	res = rspamd_tokenize_text ((gchar *)in, len,
+			&utxt,
+			RSPAMD_TOKENIZE_UTF, NULL,
 			exceptions,
 			NULL);
 
diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c
index 3c7157311..f917c26c8 100644
--- a/src/plugins/chartable.c
+++ b/src/plugins/chartable.c
@@ -619,7 +619,17 @@ chartable_symbol_callback (struct rspamd_task *task, void *unused)
 		guint i;
 		gdouble cur_score = 0.0;
 
-		words = rspamd_tokenize_text (task->subject, strlen (task->subject),
+		UText utxt = UTEXT_INITIALIZER;
+		UErrorCode uc_err = U_ZERO_ERROR;
+		gsize slen = strlen (task->subject);
+
+		utext_openUTF8 (&utxt,
+				task->subject,
+				slen,
+				&uc_err);
+
+		words = rspamd_tokenize_text (task->subject, slen,
+				&utxt,
 				RSPAMD_TOKENIZE_UTF,
 				NULL,
 				NULL,
-- 
2.39.5