summary | refs | log | tree | commit | diff | stats
path: root/src/libstat
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2017-02-25 13:38:48 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2017-02-25 13:38:48 +0000
commit e747ec5652c0ae9310ccd768c333e4dd531ad0da (patch)
tree 9f21067c48db4040b46e31c953d1485fd9cd8723 /src/libstat
parent f8cb95972dc346ad653066c2a4fd914b932991d3 (diff)
download rspamd-e747ec5652c0ae9310ccd768c333e4dd531ad0da.tar.gz
download rspamd-e747ec5652c0ae9310ccd768c333e4dd531ad0da.zip
[Minor] Use libicu for tokenizers
Diffstat (limited to 'src/libstat')
-rw-r--r-- src/libstat/tokenizers/tokenizers.c 40
1 files changed, 22 insertions, 18 deletions
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 72f7a6bb2..da6bbb58d 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -21,6 +21,8 @@
#include "tokenizers.h"
#include "stat_internal.h"
#include "../../../contrib/mumhash/mum.h"
+#include "unicode/utf8.h"
+#include "unicode/uchar.h"
typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos,
rspamd_stat_token_t * token,
@@ -169,10 +171,10 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
GList **exceptions, gboolean is_utf, gsize *rl,
gboolean check_signature)
{
- gsize remain, siglen = 0;
+ gint32 i, siglen = 0, remain;
goffset pos;
- const gchar *p, *next_p, *sig = NULL;
- gunichar uc;
+ const gchar *p, *s, *sig = NULL;
+ UChar32 uc;
guint processed = 0;
struct rspamd_process_exception *ex = NULL;
enum {
@@ -205,15 +207,20 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
}
remain = buf->len - pos;
- p = *cur;
- token->begin = p;
+ s = *cur;
+ token->begin = s;
- while (remain > 0) {
- uc = g_utf8_get_char (p);
- next_p = g_utf8_next_char (p);
+ for (i = 0; i < remain; ) {
+ p = &s[i];
+ U8_NEXT (s, i, remain, uc);
- if (next_p - p > (gint)remain) {
- return FALSE;
+ if (uc < 0) {
+ if (i < remain) {
+ uc = 0xFFFD;
+ }
+ else {
+ return FALSE;
+ }
}
switch (state) {
@@ -228,15 +235,15 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
state = skip_exception;
continue;
}
- else if (g_unichar_isgraph (uc)) {
- if (!g_unichar_ispunct (uc)) {
+ else if (u_isgraph (uc)) {
+ if (!u_ispunct (uc)) {
state = feed_token;
token->begin = p;
continue;
}
else if (check_signature && pos != 0 && (*p == '_' || *p == '-')) {
sig = p;
- siglen = remain;
+ siglen = remain - i;
state = process_signature;
continue;
}
@@ -247,7 +254,7 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
goto set_token;
}
- else if (!g_unichar_isgraph (uc) || g_unichar_ispunct (uc)) {
+ else if (!u_isgraph (uc) || u_ispunct (uc)) {
token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
goto set_token;
}
@@ -269,9 +276,6 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
}
break;
}
-
- remain -= next_p - p;
- p = next_p;
}
set_token:
@@ -284,7 +288,7 @@ set_token:
g_assert (token->len > 0);
}
- *cur = p;
+ *cur = &s[i];
return TRUE;
}