diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-08 09:41:19 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-08 09:41:57 +0000 |
commit | 9ebb11d00a678e97f14b4cd33a68c3e5cd385392 (patch) | |
tree | 2a701df0af31d895766325f8d5d759eec7a50b47 /src/libstat/tokenizers/tokenizers.c | |
parent | ad33efe0d2a35eee8a3a47a6fdffcae6d6b8e185 (diff) | |
download | rspamd-9ebb11d00a678e97f14b4cd33a68c3e5cd385392.tar.gz rspamd-9ebb11d00a678e97f14b4cd33a68c3e5cd385392.zip |
[Minor] Move subject tokenisation to a separate routine
Issue: #2623
Diffstat (limited to 'src/libstat/tokenizers/tokenizers.c')
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 70 |
1 files changed, 67 insertions, 3 deletions
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index c8e8e44df..2ef5c08fb 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -463,6 +463,70 @@ start_over: #undef SHIFT_EX -/* - * vi:ts=4 - */ +GArray * +rspamd_tokenize_subject (struct rspamd_task *task) +{ + UText utxt = UTEXT_INITIALIZER; + UErrorCode uc_err = U_ZERO_ERROR; + gsize slen; + gboolean valid_utf = TRUE; + GArray *words = NULL; + guint i = 0; + gint32 uc; + rspamd_stat_token_t *tok; + + if (task->subject) { + const gchar *p = task->subject; + + slen = strlen (task->subject); + + while (i < slen) { + U8_NEXT (p, i, slen, uc); + + if (((gint32) uc) < 0) { + valid_utf = FALSE; + break; + } +#if U_ICU_VERSION_MAJOR_NUM < 50 + if (u_isalpha (uc)) { + gint32 sc = ublock_getCode (uc); + + if (sc == UBLOCK_THAI) { + valid_utf = FALSE; + msg_info_task ("enable workaround for Thai characters for old libicu"); + break; + } + } +#endif + } + + if (valid_utf) { + utext_openUTF8 (&utxt, + task->subject, + slen, + &uc_err); + + words = rspamd_tokenize_text (task->subject, slen, + &utxt, RSPAMD_TOKENIZE_UTF, + task->cfg, NULL, NULL); + + utext_close (&utxt); + } + else { + words = rspamd_tokenize_text (task->subject, slen, + NULL, RSPAMD_TOKENIZE_RAW, + task->cfg, NULL, NULL); + } + } + + if (words != NULL) { + + for (i = 0; i < words->len; i++) { + tok = &g_array_index (words, rspamd_stat_token_t, i); + tok->flags |= RSPAMD_STAT_TOKEN_FLAG_SUBJECT; + } + } + + return words; +} + |