From: Vsevolod Stakhov <vsevolod@highsecure.ru>
Date: Sat, 25 Feb 2017 13:38:48 +0000 (+0000)
Subject: [Minor] Use libicu for tokenizers
X-Git-Tag: 1.5.0~32
X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=e747ec5652c0ae9310ccd768c333e4dd531ad0da;p=rspamd.git

[Minor] Use libicu for tokenizers
---

diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 72f7a6bb2..da6bbb58d 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -21,6 +21,8 @@
 #include "tokenizers.h"
 #include "stat_internal.h"
 #include "../../../contrib/mumhash/mum.h"
+#include "unicode/utf8.h"
+#include "unicode/uchar.h"
 
 typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos,
 		rspamd_stat_token_t * token,
@@ -169,10 +171,10 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
 		GList **exceptions, gboolean is_utf, gsize *rl,
 		gboolean check_signature)
 {
-	gsize remain, siglen = 0;
+	gint32 i, siglen = 0, remain;
 	goffset pos;
-	const gchar *p, *next_p, *sig = NULL;
-	gunichar uc;
+	const gchar *p, *s, *sig = NULL;
+	UChar32 uc;
 	guint processed = 0;
 	struct rspamd_process_exception *ex = NULL;
 	enum {
@@ -205,15 +207,20 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
 	}
 
 	remain = buf->len - pos;
-	p = *cur;
-	token->begin = p;
+	s = *cur;
+	token->begin = s;
 
-	while (remain > 0) {
-		uc = g_utf8_get_char (p);
-		next_p = g_utf8_next_char (p);
+	for (i = 0; i < remain; ) {
+		p = &s[i];
+		U8_NEXT (s, i, remain, uc);
 
-		if (next_p - p > (gint)remain) {
-			return FALSE;
+		if (uc < 0) {
+			if (i < remain) {
+				uc = 0xFFFD;
+			}
+			else {
+				return FALSE;
+			}
 		}
 
 		switch (state) {
@@ -228,15 +235,15 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
 				state = skip_exception;
 				continue;
 			}
-			else if (g_unichar_isgraph (uc)) {
-				if (!g_unichar_ispunct (uc)) {
+			else if (u_isgraph (uc)) {
+				if (!u_ispunct (uc)) {
 					state = feed_token;
 					token->begin = p;
 					continue;
 				}
 				else if (check_signature && pos != 0 && (*p == '_' || *p == '-')) {
 					sig = p;
-					siglen = remain;
+					siglen = remain - i;
 					state = process_signature;
 					continue;
 				}
@@ -247,7 +254,7 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
 				token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
 				goto set_token;
 			}
-			else if (!g_unichar_isgraph (uc) || g_unichar_ispunct (uc)) {
+			else if (!u_isgraph (uc) || u_ispunct (uc)) {
 				token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
 				goto set_token;
 			}
@@ -269,9 +276,6 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
 			}
 			break;
 		}
-
-		remain -= next_p - p;
-		p = next_p;
 	}
 
 set_token:
@@ -284,7 +288,7 @@ set_token:
 		g_assert (token->len > 0);
 	}
 
-	*cur = p;
+	*cur = &s[i];
 
 	return TRUE;
 }