aboutsummaryrefslogtreecommitdiffstats
path: root/src/libstat/tokenizers/tokenizers.c
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-08 09:41:19 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-08 09:41:57 +0000
commit: 9ebb11d00a678e97f14b4cd33a68c3e5cd385392 (patch)
tree: 2a701df0af31d895766325f8d5d759eec7a50b47 /src/libstat/tokenizers/tokenizers.c
parent: ad33efe0d2a35eee8a3a47a6fdffcae6d6b8e185 (diff)
downloadrspamd-9ebb11d00a678e97f14b4cd33a68c3e5cd385392.tar.gz
rspamd-9ebb11d00a678e97f14b4cd33a68c3e5cd385392.zip
[Minor] Move subject tokenisation to a separate routine
Issue: #2623
Diffstat (limited to 'src/libstat/tokenizers/tokenizers.c')
-rw-r--r--src/libstat/tokenizers/tokenizers.c70
1 files changed, 67 insertions, 3 deletions
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index c8e8e44df..2ef5c08fb 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -463,6 +463,70 @@ start_over:
#undef SHIFT_EX
-/*
- * vi:ts=4
- */
+/**
+ * Tokenize the subject of a task.
+ *
+ * The subject is first scanned code point by code point with U8_NEXT. If it
+ * decodes cleanly it is tokenized in UTF mode through an ICU UText,
+ * otherwise (or if the UText cannot be opened) it is tokenized in raw mode.
+ * Every resulting token gets RSPAMD_STAT_TOKEN_FLAG_SUBJECT set in its flags.
+ *
+ * @param task task whose `subject` and `cfg` fields are read
+ * @return whatever rspamd_tokenize_text() returns for the subject (an array
+ * of rspamd_stat_token_t), or NULL if the task has no subject.
+ * NOTE(review): ownership of the returned array is presumably passed to the
+ * caller - confirm against the callers.
+ */
+GArray *
+rspamd_tokenize_subject (struct rspamd_task *task)
+{
+	UText utxt = UTEXT_INITIALIZER;
+	UErrorCode uc_err = U_ZERO_ERROR;
+	gsize slen;
+	gboolean valid_utf = TRUE;
+	GArray *words = NULL;
+	guint i = 0;
+	gint32 uc;
+	rspamd_stat_token_t *tok;
+
+	if (task->subject) {
+		const gchar *p = task->subject;
+
+		slen = strlen (task->subject);
+
+		/* Validation pass: U8_NEXT yields a negative value on a bad sequence */
+		while (i < slen) {
+			U8_NEXT (p, i, slen, uc);
+
+			if (uc < 0) {
+				valid_utf = FALSE;
+				break;
+			}
+#if U_ICU_VERSION_MAJOR_NUM < 50
+			if (u_isalpha (uc)) {
+				gint32 sc = ublock_getCode (uc);
+
+				if (sc == UBLOCK_THAI) {
+					valid_utf = FALSE;
+					msg_info_task ("enable workaround for Thai characters for old libicu");
+					break;
+				}
+			}
+#endif
+		}
+
+		if (valid_utf) {
+			utext_openUTF8 (&utxt,
+					task->subject,
+					slen,
+					&uc_err);
+
+			if (U_FAILURE (uc_err)) {
+				/*
+				 * Do not hand an unopened UText to the tokenizer:
+				 * degrade to the raw path instead.
+				 */
+				msg_info_task ("cannot open subject as utf8 text: icu error %d, "
+						"tokenize it as raw", (gint) uc_err);
+				valid_utf = FALSE;
+			}
+		}
+
+		if (valid_utf) {
+			words = rspamd_tokenize_text (task->subject, slen,
+					&utxt, RSPAMD_TOKENIZE_UTF,
+					task->cfg, NULL, NULL);
+
+			utext_close (&utxt);
+		}
+		else {
+			words = rspamd_tokenize_text (task->subject, slen,
+					NULL, RSPAMD_TOKENIZE_RAW,
+					task->cfg, NULL, NULL);
+		}
+	}
+
+	if (words != NULL) {
+		/* Mark every token as originating from the subject */
+		for (i = 0; i < words->len; i++) {
+			tok = &g_array_index (words, rspamd_stat_token_t, i);
+			tok->flags |= RSPAMD_STAT_TOKEN_FLAG_SUBJECT;
+		}
+	}
+
+	return words;
+}
+
+