From 9ebb11d00a678e97f14b4cd33a68c3e5cd385392 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Thu, 8 Nov 2018 09:41:19 +0000 Subject: [PATCH] [Minor] Move subject tokenisation to a separate routine Issue: #2623 --- src/libstat/stat_process.c | 27 +---------- src/libstat/tokenizers/tokenizers.c | 70 +++++++++++++++++++++++++++-- src/libstat/tokenizers/tokenizers.h | 2 + src/plugins/chartable.c | 19 +------- 4 files changed, 71 insertions(+), 47 deletions(-) diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index 2641226d5..228b6b87b 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -313,10 +313,8 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, { struct rspamd_mime_text_part *part; rspamd_cryptobox_hash_state_t hst; - rspamd_stat_token_t *tok; rspamd_token_t *st_tok; GArray *words; - gchar *sub = NULL; guint i, reserved_len = 0; gdouble *pdiff; guchar hout[rspamd_cryptobox_HASHBYTES]; @@ -361,29 +359,8 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, } if (task->subject != NULL) { - sub = task->subject; - } - - if (sub != NULL) { - UText utxt = UTEXT_INITIALIZER; - UErrorCode uc_err = U_ZERO_ERROR; - gsize slen = strlen (sub); - - utext_openUTF8 (&utxt, - sub, - slen, - &uc_err); - - words = rspamd_tokenize_text (sub, slen, &utxt, RSPAMD_TOKENIZE_UTF, - NULL, NULL, NULL); - + words = rspamd_tokenize_subject (task); if (words != NULL) { - - for (i = 0; i < words->len; i ++) { - tok = &g_array_index (words, rspamd_stat_token_t, i); - tok->flags |= RSPAMD_STAT_TOKEN_FLAG_SUBJECT; - } - st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool, words, @@ -394,8 +371,6 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, rspamd_mempool_add_destructor (task->task_pool, rspamd_array_free_hard, words); } - - utext_close (&utxt); } rspamd_stat_tokenize_parts_metadata (st_ctx, task); diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 
c8e8e44df..2ef5c08fb 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -463,6 +463,85 @@ start_over:
 
 #undef SHIFT_EX
 
-/*
- * vi:ts=4
- */
+/**
+ * Tokenizes the subject of a task and flags every produced token with
+ * RSPAMD_STAT_TOKEN_FLAG_SUBJECT. A subject that is valid UTF-8 and can be
+ * opened by ICU is tokenized with RSPAMD_TOKENIZE_UTF, anything else with
+ * RSPAMD_TOKENIZE_RAW (with ICU older than 50 this includes Thai letters).
+ * @param task task to take the subject from
+ * @return array of rspamd_stat_token_t owned by the caller, or NULL
+ */
+GArray *
+rspamd_tokenize_subject (struct rspamd_task *task)
+{
+	UText utxt = UTEXT_INITIALIZER;
+	UErrorCode uc_err = U_ZERO_ERROR;
+	gsize slen;
+	gboolean valid_utf = TRUE;
+	GArray *words = NULL;
+	guint i = 0;
+	gint32 uc;
+	rspamd_stat_token_t *tok;
+
+	if (task->subject) {
+		const gchar *p = task->subject;
+
+		slen = strlen (task->subject);
+
+		/* Scan the whole subject once to decide between UTF and raw modes */
+		while (i < slen) {
+			U8_NEXT (p, i, slen, uc);
+
+			if (((gint32) uc) < 0) {
+				valid_utf = FALSE;
+				break;
+			}
+#if U_ICU_VERSION_MAJOR_NUM < 50
+			if (u_isalpha (uc)) {
+				gint32 sc = ublock_getCode (uc);
+
+				if (sc == UBLOCK_THAI) {
+					valid_utf = FALSE;
+					msg_info_task ("enable workaround for Thai characters for old libicu");
+					break;
+				}
+			}
+#endif
+		}
+
+		if (valid_utf) {
+			utext_openUTF8 (&utxt,
+					task->subject,
+					slen,
+					&uc_err);
+
+			if (U_FAILURE (uc_err)) {
+				/* ICU could not open the text, fall back to raw tokenization */
+				valid_utf = FALSE;
+			}
+		}
+
+		if (valid_utf) {
+			words = rspamd_tokenize_text (task->subject, slen,
+					&utxt, RSPAMD_TOKENIZE_UTF,
+					task->cfg, NULL, NULL);
+
+			utext_close (&utxt);
+		}
+		else {
+			words = rspamd_tokenize_text (task->subject, slen,
+					NULL, RSPAMD_TOKENIZE_RAW,
+					task->cfg, NULL, NULL);
+		}
+	}
+
+	if (words != NULL) {
+		/* Mark all tokens as coming from the subject */
+		for (i = 0; i < words->len; i++) {
+			tok = &g_array_index (words, rspamd_stat_token_t, i);
+			tok->flags |= RSPAMD_STAT_TOKEN_FLAG_SUBJECT;
+		}
+	}
+
+	return words;
+}
diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h
index 6c538eafc..bfabde74f 100644
--- a/src/libstat/tokenizers/tokenizers.h
+++ b/src/libstat/tokenizers/tokenizers.h
@@ -57,6 +57,13 @@ gpointer rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool,
 		struct rspamd_tokenizer_config *cf,
 		gsize *len);
 
+/**
+ * Tokenizes the subject of a task, flagging tokens as subject tokens
+ * @param task task to take the subject from
+ * @return array of rspamd_stat_token_t owned by the caller, or NULL
+ */
+GArray * rspamd_tokenize_subject (struct rspamd_task *task);
+
 #endif
 /*
  * vi:ts=4
diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c
index ce80b2e66..0efbe55ca 100644
--- a/src/plugins/chartable.c
+++ 
b/src/plugins/chartable.c @@ -622,24 +622,9 @@ chartable_symbol_callback (struct rspamd_task *task, if (task->subject != NULL) { GArray *words; rspamd_stat_token_t *w; - guint i; gdouble cur_score = 0.0; - UText utxt = UTEXT_INITIALIZER; - UErrorCode uc_err = U_ZERO_ERROR; - gsize slen = strlen (task->subject); - - utext_openUTF8 (&utxt, - task->subject, - slen, - &uc_err); - - words = rspamd_tokenize_text (task->subject, slen, - &utxt, - RSPAMD_TOKENIZE_UTF, - NULL, - NULL, - NULL); + words = rspamd_tokenize_subject (task); if (words && words->len > 0) { for (i = 0; i < words->len; i++) { @@ -664,8 +649,6 @@ chartable_symbol_callback (struct rspamd_task *task, if (words) { g_array_free (words, TRUE); } - - utext_close (&utxt); } rspamd_symcache_finalize_item (task, item); -- 2.39.5