source.dussan.org Git - rspamd.git/commitdiff
[Minor] Move subject tokenisation to a separate routine
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 8 Nov 2018 09:41:19 +0000 (09:41 +0000)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 8 Nov 2018 09:41:57 +0000 (09:41 +0000)
Issue: #2623

src/libstat/stat_process.c
src/libstat/tokenizers/tokenizers.c
src/libstat/tokenizers/tokenizers.h
src/plugins/chartable.c

index 2641226d566b919acb0924e9af8d3072bb98c9ca..228b6b87bc6b969d012a04865759c7fdaae60eb9 100644 (file)
@@ -313,10 +313,8 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
 {
        struct rspamd_mime_text_part *part;
        rspamd_cryptobox_hash_state_t hst;
-       rspamd_stat_token_t *tok;
        rspamd_token_t *st_tok;
        GArray *words;
-       gchar *sub = NULL;
        guint i, reserved_len = 0;
        gdouble *pdiff;
        guchar hout[rspamd_cryptobox_HASHBYTES];
@@ -361,29 +359,8 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
        }
 
        if (task->subject != NULL) {
-               sub = task->subject;
-       }
-
-       if (sub != NULL) {
-               UText utxt = UTEXT_INITIALIZER;
-               UErrorCode uc_err = U_ZERO_ERROR;
-               gsize slen = strlen (sub);
-
-               utext_openUTF8 (&utxt,
-                               sub,
-                               slen,
-                               &uc_err);
-
-               words = rspamd_tokenize_text (sub, slen, &utxt, RSPAMD_TOKENIZE_UTF,
-                               NULL, NULL, NULL);
-
+               words = rspamd_tokenize_subject (task);
                if (words != NULL) {
-
-                       for (i = 0; i < words->len; i ++) {
-                               tok = &g_array_index (words, rspamd_stat_token_t, i);
-                               tok->flags |= RSPAMD_STAT_TOKEN_FLAG_SUBJECT;
-                       }
-
                        st_ctx->tokenizer->tokenize_func (st_ctx,
                                        task->task_pool,
                                        words,
@@ -394,8 +371,6 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
                        rspamd_mempool_add_destructor (task->task_pool,
                                        rspamd_array_free_hard, words);
                }
-
-               utext_close (&utxt);
        }
 
        rspamd_stat_tokenize_parts_metadata (st_ctx, task);
index c8e8e44dfe47bc12957f736760c15ff68ae7bce6..2ef5c08fb7437d1f7e5490ba8e23261cac5279d7 100644 (file)
@@ -463,6 +463,70 @@ start_over:
 
 #undef SHIFT_EX
 
-/*
- * vi:ts=4
- */
+GArray * /* token array, or NULL when task->subject is unset; callers in this commit release it themselves */
+rspamd_tokenize_subject (struct rspamd_task *task) /* tokenise task->subject and flag every token as a subject token */
+{
+       UText utxt = UTEXT_INITIALIZER;
+       UErrorCode uc_err = U_ZERO_ERROR; /* NOTE(review): only passed to utext_openUTF8, never inspected afterwards */
+       gsize slen;
+       gboolean valid_utf = TRUE; /* cleared when the subject has to be tokenised as raw bytes */
+       GArray *words = NULL;
+       guint i = 0; /* byte offset during the UTF-8 scan, reused below as the token index */
+       gint32 uc;
+       rspamd_stat_token_t *tok;
+
+       if (task->subject) {
+               const gchar *p = task->subject;
+
+               slen = strlen (task->subject);
+
+               while (i < slen) { /* scan the whole subject to decide between the UTF and raw tokenisers */
+                       U8_NEXT (p, i, slen, uc); /* ICU macro: reads one code point at offset i and advances i */
+
+                       if (((gint32) uc) < 0) { /* negative value: the bytes at this offset are not valid UTF-8 */
+                               valid_utf = FALSE;
+                               break;
+                       }
+#if U_ICU_VERSION_MAJOR_NUM < 50 /* old libicu only: Thai letters force the raw path */
+                       if (u_isalpha (uc)) {
+                               gint32 sc = ublock_getCode (uc);
+
+                               if (sc == UBLOCK_THAI) {
+                                       valid_utf = FALSE;
+                                       msg_info_task ("enable workaround for Thai characters for old libicu");
+                                       break;
+                               }
+                       }
+#endif
+               }
+
+               if (valid_utf) { /* UTF path: tokenise through a UText opened over the subject bytes */
+                       utext_openUTF8 (&utxt,
+                                       task->subject,
+                                       slen,
+                                       &uc_err);
+
+                       words = rspamd_tokenize_text (task->subject, slen,
+                                       &utxt, RSPAMD_TOKENIZE_UTF,
+                                       task->cfg, NULL, NULL);
+
+                       utext_close (&utxt); /* the UText is closed right after tokenisation, before words is returned */
+               }
+               else { /* raw path: no UText is passed, tokeniser runs in RSPAMD_TOKENIZE_RAW mode */
+                       words = rspamd_tokenize_text (task->subject, slen,
+                                       NULL, RSPAMD_TOKENIZE_RAW,
+                                       task->cfg, NULL, NULL);
+               }
+       }
+
+       if (words != NULL) { /* words stays NULL without a subject; the tokeniser result is also checked here */
+
+               for (i = 0; i < words->len; i++) { /* tag every produced token as originating from the subject */
+                       tok = &g_array_index (words, rspamd_stat_token_t, i);
+                       tok->flags |= RSPAMD_STAT_TOKEN_FLAG_SUBJECT;
+               }
+       }
+
+       return words;
+}
+
index 6c538eafc049f4556f25d7492a664292f4e96524..bfabde74f761199e886812b5cc12fd4481ab3ba8 100644 (file)
@@ -57,6 +57,8 @@ gpointer rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool,
                struct rspamd_tokenizer_config *cf,
                gsize *len);
 
+
+GArray * rspamd_tokenize_subject (struct rspamd_task *task);
 #endif
 /*
  * vi:ts=4
index ce80b2e66224a5ee0f5b8d24536ccad8bbfab212..0efbe55cacf0c42dd31f9dab75eee01f7d5c115c 100644 (file)
@@ -622,24 +622,9 @@ chartable_symbol_callback (struct rspamd_task *task,
        if (task->subject != NULL) {
                GArray *words;
                rspamd_stat_token_t *w;
-               guint i;
                gdouble cur_score = 0.0;
 
-               UText utxt = UTEXT_INITIALIZER;
-               UErrorCode uc_err = U_ZERO_ERROR;
-               gsize slen = strlen (task->subject);
-
-               utext_openUTF8 (&utxt,
-                               task->subject,
-                               slen,
-                               &uc_err);
-
-               words = rspamd_tokenize_text (task->subject, slen,
-                               &utxt,
-                               RSPAMD_TOKENIZE_UTF,
-                               NULL,
-                               NULL,
-                               NULL);
+               words = rspamd_tokenize_subject (task);
 
                if (words && words->len > 0) {
                        for (i = 0; i < words->len; i++) {
@@ -664,8 +649,6 @@ chartable_symbol_callback (struct rspamd_task *task,
                if (words) {
                        g_array_free (words, TRUE);
                }
-
-               utext_close (&utxt);
        }
 
        rspamd_symcache_finalize_item (task, item);