source.dussan.org Git - rspamd.git/commitdiff
[Rework] Rework utf content processing in text parts
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 5 Sep 2018 16:43:20 +0000 (17:43 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 5 Sep 2018 16:44:35 +0000 (17:44 +0100)
- Store unicode in UTF parts
- Store unicode for HTML parts
- Rename struct fields and split them into unicode/utf components

16 files changed:
src/libmime/lang_detection.c
src/libmime/lang_detection.h
src/libmime/message.c
src/libmime/message.h
src/libmime/mime_encoding.c
src/libmime/mime_encoding.h
src/libserver/re_cache.c
src/libserver/task.c
src/libserver/url.c
src/libstat/stat_process.c
src/libstat/tokenizers/tokenizers.c
src/libstat/tokenizers/tokenizers.h
src/lua/lua_mimepart.c
src/lua/lua_trie.c
src/plugins/chartable.c
src/plugins/fuzzy_check.c

index 8763365af14688be3a00fcf5a310523f9c339cdc..d3c418203570b96ec449f3a887f1290a088fba97 100644 (file)
@@ -1323,7 +1323,7 @@ rspamd_language_detector_cmp_heuristic (gconstpointer a, gconstpointer b,
 GPtrArray *
 rspamd_language_detector_detect (struct rspamd_task *task,
                struct rspamd_lang_detector *d,
-               GArray *ucs_tokens, gsize words_len)
+               GArray *ucs_tokens)
 {
        khash_t(rspamd_candidates_hash) *candidates;
        GPtrArray *result;
index 2d28ec65ad17af514327f3a6a09f547000a910bd..2ede46d02e18129681f541227790a1848167cb85 100644 (file)
@@ -61,6 +61,6 @@ void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
  */
 GPtrArray * rspamd_language_detector_detect (struct rspamd_task *task,
                struct rspamd_lang_detector *d,
-               GArray *ucs_tokens, gsize words_len);
+               GArray *ucs_tokens);
 
 #endif
index e6cb63504cad4a2d4ead8cb1826050c99d4231bd..1df98075846dc04d4eab4a3342d61f6ba3325754 100644 (file)
@@ -67,7 +67,7 @@ rspamd_mime_part_extract_words (struct rspamd_task *task,
        guint i, nlen, total_len = 0, short_len = 0;
        gdouble avg_len = 0;
 
-       if (part->normalized_words) {
+       if (part->utf_words) {
 #ifdef WITH_SNOWBALL
                static GHashTable *stemmers = NULL;
 
@@ -97,10 +97,10 @@ rspamd_mime_part_extract_words (struct rspamd_task *task,
 #endif
 
 
-               for (i = 0; i < part->normalized_words->len; i++) {
+               for (i = 0; i < part->utf_words->len; i++) {
                        guint64 h;
 
-                       w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
+                       w = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
                        r = NULL;
 #ifdef WITH_SNOWBALL
                        if (stem) {
@@ -156,7 +156,7 @@ rspamd_mime_part_extract_words (struct rspamd_task *task,
                        }
                }
 
-               if (part->normalized_words && part->normalized_words->len) {
+               if (part->utf_words && part->utf_words->len) {
                        gdouble *avg_len_p, *short_len_p;
 
                        avg_len_p = rspamd_mempool_get_variable (task->task_pool,
@@ -205,41 +205,41 @@ rspamd_mime_part_create_words (struct rspamd_task *task,
 
        /* Ugly workaround */
        if (IS_PART_HTML (part)) {
-               part->normalized_words = rspamd_tokenize_text (
-                               part->stripped_content->data,
-                               part->stripped_content->len, tok_type, task->cfg,
+               part->utf_words = rspamd_tokenize_text (
+                               part->utf_stripped_content->data,
+                               part->utf_stripped_content->len, tok_type, task->cfg,
                                part->exceptions,
                                NULL);
        }
        else {
-               part->normalized_words = rspamd_tokenize_text (
-                               part->stripped_content->data,
-                               part->stripped_content->len, tok_type, task->cfg,
+               part->utf_words = rspamd_tokenize_text (
+                               part->utf_stripped_content->data,
+                               part->utf_stripped_content->len, tok_type, task->cfg,
                                part->exceptions,
                                NULL);
        }
 
-       if (part->normalized_words) {
+       if (part->utf_words) {
                part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
-                               sizeof (guint64), part->normalized_words->len);
+                               sizeof (guint64), part->utf_words->len);
 
                if (IS_PART_UTF (part) && task->lang_det) {
-                       part->ucs32_words = g_array_sized_new (FALSE, FALSE,
-                                       sizeof (rspamd_stat_token_t), part->normalized_words->len);
+                       part->unicode_words = g_array_sized_new (FALSE, FALSE,
+                                       sizeof (rspamd_stat_token_t), part->utf_words->len);
                }
 
-               if (part->ucs32_words) {
+               if (part->unicode_words) {
 
 
-                       for (i = 0; i < part->normalized_words->len; i++) {
-                               w = &g_array_index (part->normalized_words, rspamd_stat_token_t,
+                       for (i = 0; i < part->utf_words->len; i++) {
+                               w = &g_array_index (part->utf_words, rspamd_stat_token_t,
                                                i);
 
                                if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
                                        rspamd_language_detector_to_ucs (task->lang_det,
                                                        task->task_pool,
                                                        w, &ucs_w);
-                                       g_array_append_val (part->ucs32_words, ucs_w);
+                                       g_array_append_val (part->unicode_words, ucs_w);
                                        ucs_len += ucs_w.len;
                                }
                        }
@@ -251,14 +251,14 @@ rspamd_mime_part_create_words (struct rspamd_task *task,
 
 static void
 rspamd_mime_part_detect_language (struct rspamd_task *task,
-               struct rspamd_mime_text_part *part, guint ucs_len)
+               struct rspamd_mime_text_part *part)
 {
        struct rspamd_lang_detector_res *lang;
 
-       if (part->ucs32_words) {
+       if (part->unicode_words) {
                part->languages = rspamd_language_detector_detect (task,
                                task->lang_det,
-                               part->ucs32_words, ucs_len);
+                               part->unicode_words);
 
                if (part->languages->len > 0) {
                        lang = g_ptr_array_index (part->languages, 0);
@@ -289,7 +289,7 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
                                state = seen_cr;
                                if (p > c) {
                                        last_c = *(p - 1);
-                                       g_byte_array_append (part->stripped_content,
+                                       g_byte_array_append (part->utf_stripped_content,
                                                        (const guint8 *)c, p - c);
                                }
 
@@ -299,11 +299,11 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
                        case seen_cr:
                                /* Double \r\r */
                                if (!crlf_added) {
-                                       g_byte_array_append (part->stripped_content,
+                                       g_byte_array_append (part->utf_stripped_content,
                                                        (const guint8 *)" ", 1);
                                        crlf_added = TRUE;
                                        g_ptr_array_add (part->newlines,
-                                                       (((gpointer) (goffset) (part->stripped_content->len))));
+                                                       (((gpointer) (goffset) (part->utf_stripped_content->len))));
                                }
 
                                part->nlines ++;
@@ -326,17 +326,17 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
 
                                if (p > c) {
                                        last_c = *(p - 1);
-                                       g_byte_array_append (part->stripped_content,
+                                       g_byte_array_append (part->utf_stripped_content,
                                                        (const guint8 *)c, p - c);
                                }
 
                                c = p + 1;
 
                                if (IS_PART_HTML (part) || g_ascii_ispunct (last_c)) {
-                                       g_byte_array_append (part->stripped_content,
+                                       g_byte_array_append (part->utf_stripped_content,
                                                        (const guint8 *)" ", 1);
                                        g_ptr_array_add (part->newlines,
-                                                       (((gpointer) (goffset) (part->stripped_content->len))));
+                                                       (((gpointer) (goffset) (part->utf_stripped_content->len))));
                                        crlf_added = TRUE;
                                }
                                else {
@@ -348,13 +348,13 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
                                /* \r\n */
                                if (!crlf_added) {
                                        if (IS_PART_HTML (part) || g_ascii_ispunct (last_c)) {
-                                               g_byte_array_append (part->stripped_content,
+                                               g_byte_array_append (part->utf_stripped_content,
                                                                (const guint8 *) " ", 1);
                                                crlf_added = TRUE;
                                        }
 
                                        g_ptr_array_add (part->newlines,
-                                                       (((gpointer) (goffset) (part->stripped_content->len))));
+                                                       (((gpointer) (goffset) (part->utf_stripped_content->len))));
                                }
 
                                c = p + 1;
@@ -364,11 +364,11 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
                        case seen_lf:
                                /* Double \n\n */
                                if (!crlf_added) {
-                                       g_byte_array_append (part->stripped_content,
+                                       g_byte_array_append (part->utf_stripped_content,
                                                        (const guint8 *)" ", 1);
                                        crlf_added = TRUE;
                                        g_ptr_array_add (part->newlines,
-                                                       (((gpointer) (goffset) (part->stripped_content->len))));
+                                                       (((gpointer) (goffset) (part->utf_stripped_content->len))));
                                }
 
                                part->nlines++;
@@ -414,13 +414,13 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
 
                                if (!crlf_added) {
                                        g_ptr_array_add (part->newlines,
-                                                       (((gpointer) (goffset) (part->stripped_content->len))));
+                                                       (((gpointer) (goffset) (part->utf_stripped_content->len))));
                                }
 
                                /* Skip initial spaces */
                                if (G_UNLIKELY (*p == ' ')) {
                                        if (!crlf_added) {
-                                               g_byte_array_append (part->stripped_content,
+                                               g_byte_array_append (part->utf_stripped_content,
                                                                (const guint8 *)" ", 1);
                                        }
 
@@ -451,7 +451,7 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
 
                switch (state) {
                case normal_char:
-                       g_byte_array_append (part->stripped_content,
+                       g_byte_array_append (part->utf_stripped_content,
                                        (const guint8 *)c, p - c);
 
                        while (c < p) {
@@ -479,10 +479,10 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
                default:
 
                        if (!crlf_added) {
-                               g_byte_array_append (part->stripped_content,
+                               g_byte_array_append (part->utf_stripped_content,
                                                (const guint8 *)" ", 1);
                                g_ptr_array_add (part->newlines,
-                                               (((gpointer) (goffset) (part->stripped_content->len))));
+                                               (((gpointer) (goffset) (part->utf_stripped_content->len))));
                        }
 
                        part->nlines++;
@@ -502,10 +502,10 @@ rspamd_normalize_text_part (struct rspamd_task *task,
        struct rspamd_process_exception *ex;
 
        /* Strip newlines */
-       part->stripped_content = g_byte_array_sized_new (part->content->len);
+       part->utf_stripped_content = g_byte_array_sized_new (part->utf_content->len);
        part->newlines = g_ptr_array_sized_new (128);
-       p = (const gchar *)part->content->data;
-       end = p + part->content->len;
+       p = (const gchar *)part->utf_content->data;
+       end = p + part->utf_content->len;
 
        rspamd_strip_newlines_parse (p, end, part);
 
@@ -513,7 +513,7 @@ rspamd_normalize_text_part (struct rspamd_task *task,
                ex = rspamd_mempool_alloc (task->task_pool, sizeof (*ex));
                off = (goffset)g_ptr_array_index (part->newlines, i);
                g_ptr_array_index (part->newlines, i) = (gpointer)(goffset)
-                               (part->stripped_content->data + off);
+                               (part->utf_stripped_content->data + off);
                ex->pos = off;
                ex->len = 0;
                ex->type = RSPAMD_EXCEPTION_NEWLINE;
@@ -522,7 +522,7 @@ rspamd_normalize_text_part (struct rspamd_task *task,
 
        rspamd_mempool_add_destructor (task->task_pool,
                        (rspamd_mempool_destruct_t) free_byte_array_callback,
-                       part->stripped_content);
+                       part->utf_stripped_content);
        rspamd_mempool_add_destructor (task->task_pool,
                        (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
                        part->newlines);
@@ -615,10 +615,10 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part
                g_assert (rspamd_multipattern_compile (gtube_matcher, NULL));
        }
 
-       if (part->content && part->content->len >= sizeof (gtube_pattern_reject) &&
-                       part->content->len <= max_check_size) {
-               if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->content->data,
-                               part->content->len,
+       if (part->utf_content && part->utf_content->len >= sizeof (gtube_pattern_reject) &&
+                       part->utf_content->len <= max_check_size) {
+               if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->utf_content->data,
+                               part->utf_content->len,
                                rspamd_multipattern_gtube_cb, NULL, NULL)) > 0) {
 
                        switch (ret) {
@@ -639,7 +639,7 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part
                                msg_info_task (
                                                "<%s>: gtube %s pattern has been found in part of length %ud",
                                                task->message_id, rspamd_action_to_str (act),
-                                               part->content->len);
+                                               part->utf_content->len);
                        }
                }
        }
@@ -655,9 +655,86 @@ exceptions_compare_func (gconstpointer a, gconstpointer b)
        return ea->pos - eb->pos;
 }
 
+/*
+ * Prepare a plain text part for further processing.
+ *
+ * An empty part (parsed.len == 0) is only flagged as EMPTY and accepted;
+ * utf_content and unicode_content are not assigned in that case.
+ * Otherwise the part is passed to rspamd_mime_text_part_maybe_convert ()
+ * and, if UTF-8 raw content is available afterwards, the processed content
+ * pointers are aliased to the raw ones (plain text has no tags to strip).
+ *
+ * Returns TRUE if the part is empty or has usable UTF-8 content, FALSE if
+ * no UTF-8 raw content is available after the conversion attempt.
+ */
+static gboolean
+rspamd_message_process_plain_text_part (struct rspamd_task *task,
+               struct rspamd_mime_text_part *text_part)
+{
+       if (text_part->parsed.len == 0) {
+               text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
+
+               return TRUE;
+       }
+
+       rspamd_mime_text_part_maybe_convert (task, text_part);
+
+       if (text_part->utf_raw_content == NULL) {
+               /*
+                * We ignore unconverted parts from now as it is dangerous
+                * to treat them as text parts
+                */
+               return FALSE;
+       }
+
+       /* Different from HTML, where we also parse HTML and strip tags */
+       text_part->utf_content = text_part->utf_raw_content;
+       text_part->unicode_content = text_part->unicode_raw_content;
+
+       return TRUE;
+}
+
+/*
+ * Prepare an HTML text part: mark it as HTML, try to convert the raw data,
+ * run the HTML processor to obtain utf_content and build the matching
+ * unicode_content (a GArray of UChar).
+ *
+ * An empty part (parsed.len == 0) is only flagged HTML | EMPTY and accepted;
+ * utf_content and unicode_content are not assigned in that case.
+ *
+ * Both produced arrays get destructors registered in the task pool.
+ *
+ * Returns TRUE if the part is empty or has been processed, FALSE if no
+ * UTF-8 raw content is available after the conversion attempt.
+ */
+static gboolean
+rspamd_message_process_html_text_part (struct rspamd_task *task,
+               struct rspamd_mime_text_part *text_part)
+{
+       GByteArray *stripped;
+       GArray *ucs;
+
+       if (text_part->parsed.len == 0) {
+               /* Nothing to parse: only set the flags */
+               text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML |
+                               RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
+
+               return TRUE;
+       }
+
+       text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML;
+       rspamd_mime_text_part_maybe_convert (task, text_part);
+
+       if (!text_part->utf_raw_content) {
+               return FALSE;
+       }
+
+       text_part->html = rspamd_mempool_alloc0 (task->task_pool,
+                       sizeof (*text_part->html));
+       text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_BALANCED;
+
+       stripped = rspamd_html_process_part_full (task->task_pool,
+                       text_part->html,
+                       text_part->utf_raw_content,
+                       &text_part->exceptions,
+                       task->urls,
+                       task->emails);
+       text_part->utf_content = stripped;
+
+       if (stripped->len == 0) {
+               text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
+       }
+
+       /* Also add unicode content */
+       ucs = g_array_sized_new (FALSE, FALSE, sizeof (UChar), stripped->len + 1);
+       text_part->unicode_content = ucs;
+       rspamd_utf_to_unicode (stripped, ucs);
+
+       rspamd_mempool_add_destructor (task->task_pool,
+                       (rspamd_mempool_destruct_t) free_byte_array_callback,
+                       stripped);
+       rspamd_mempool_add_destructor (task->task_pool,
+                       rspamd_array_free_hard,
+                       ucs);
+
+       return TRUE;
+}
+
 static void
-rspamd_message_process_text_part (struct rspamd_task *task,
-       struct rspamd_mime_part *mime_part)
+rspamd_message_process_text_part_maybe (struct rspamd_task *task,
+                                                                               struct rspamd_mime_part *mime_part)
 {
        struct rspamd_mime_text_part *text_part;
        rspamd_ftok_t html_tok, xhtml_tok;
@@ -738,87 +815,31 @@ rspamd_message_process_text_part (struct rspamd_task *task,
                debug_task ("skip attachments for checking as text parts");
                return;
        }
-
-       if (found_html) {
-               text_part = rspamd_mempool_alloc0 (task->task_pool,
-                               sizeof (struct rspamd_mime_text_part));
-               text_part->raw.begin = mime_part->raw_data.begin;
-               text_part->raw.len = mime_part->raw_data.len;
-               text_part->parsed.begin = mime_part->parsed_data.begin;
-               text_part->parsed.len = mime_part->parsed_data.len;
-               text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML;
-               text_part->mime_part = mime_part;
-
-               if (mime_part->parsed_data.len == 0) {
-                       text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
-                       g_ptr_array_add (task->text_parts, text_part);
-                       return;
-               }
-
-               rspamd_mime_text_part_maybe_convert (task, text_part);
-
-               if (text_part->utf_raw_content == NULL) {
-                       return;
-               }
-
-               text_part->html = rspamd_mempool_alloc0 (task->task_pool,
-                               sizeof (*text_part->html));
-               text_part->mime_part = mime_part;
-
-               text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_BALANCED;
-               text_part->content = rspamd_html_process_part_full (
-                               task->task_pool,
-                               text_part->html,
-                               text_part->utf_raw_content,
-                               &text_part->exceptions,
-                               task->urls,
-                               task->emails);
-
-               if (text_part->content->len == 0) {
-                       text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
-               }
-
-               rspamd_mempool_add_destructor (task->task_pool,
-                       (rspamd_mempool_destruct_t) free_byte_array_callback,
-                       text_part->content);
-               g_ptr_array_add (task->text_parts, text_part);
+       else if (!(found_txt || found_html)) {
+               /* Not a text part */
+               return;
        }
-       else if (found_txt) {
-               text_part =
-                       rspamd_mempool_alloc0 (task->task_pool,
-                               sizeof (struct rspamd_mime_text_part));
-               text_part->mime_part = mime_part;
-               text_part->raw.begin = mime_part->raw_data.begin;
-               text_part->raw.len = mime_part->raw_data.len;
-               text_part->parsed.begin = mime_part->parsed_data.begin;
-               text_part->parsed.len = mime_part->parsed_data.len;
-               text_part->mime_part = mime_part;
-
-               if (mime_part->parsed_data.len == 0) {
-                       text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
-                       g_ptr_array_add (task->text_parts, text_part);
-                       return;
-               }
 
-               rspamd_mime_text_part_maybe_convert (task, text_part);
+       text_part = rspamd_mempool_alloc0 (task->task_pool,
+                       sizeof (struct rspamd_mime_text_part));
+       text_part->mime_part = mime_part;
+       text_part->raw.begin = mime_part->raw_data.begin;
+       text_part->raw.len = mime_part->raw_data.len;
+       text_part->parsed.begin = mime_part->parsed_data.begin;
+       text_part->parsed.len = mime_part->parsed_data.len;
 
-               if (text_part->utf_raw_content != NULL) {
-                       /*
-                        * We ignore unconverted parts from now as it is dangerous
-                        * to treat them as text parts
-                        */
-                       text_part->content = text_part->utf_raw_content;
-                       g_ptr_array_add (task->text_parts, text_part);
-               }
-               else {
+       if (found_html) {
+               if (!rspamd_message_process_html_text_part (task, text_part)) {
                        return;
                }
        }
        else {
-               return;
+               if (!rspamd_message_process_plain_text_part (task, text_part)) {
+                       return;
+               }
        }
 
-
+       g_ptr_array_add (task->text_parts, text_part);
        mime_part->flags |= RSPAMD_MIME_PART_TEXT;
        mime_part->specific.txt = text_part;
 
@@ -867,7 +888,7 @@ rspamd_message_process_text_part (struct rspamd_task *task,
                                text_part->exceptions);
        }
 
-       text_part->ucs_len = rspamd_mime_part_create_words (task, text_part);
+       rspamd_mime_part_create_words (task, text_part);
 }
 
 /* Creates message from various data using libmagic to detect type */
@@ -1172,7 +1193,7 @@ rspamd_message_process (struct rspamd_task *task)
                struct rspamd_mime_part *part;
 
                part = g_ptr_array_index (task->parts, i);
-               rspamd_message_process_text_part (task, part);
+               rspamd_message_process_text_part_maybe (task, part);
        }
 
        rspamd_images_process (task);
@@ -1207,7 +1228,7 @@ rspamd_message_process (struct rspamd_task *task)
                                                sel = p2;
                                        }
                                        else {
-                                               if (p1->ucs_len > p2->ucs_len) {
+                                               if (p1->unicode_content->len > p2->unicode_content->len) {
                                                        sel = p1;
                                                }
                                                else {
@@ -1215,7 +1236,7 @@ rspamd_message_process (struct rspamd_task *task)
                                                }
                                        }
 
-                                       rspamd_mime_part_detect_language (task, sel, sel->ucs_len);
+                                       rspamd_mime_part_detect_language (task, sel);
 
                                        if (sel->language && sel->language[0]) {
                                                /* Propagate language */
@@ -1274,13 +1295,13 @@ rspamd_message_process (struct rspamd_task *task)
 
        PTR_ARRAY_FOREACH (task->text_parts, i, text_part) {
                if (!text_part->language) {
-                       rspamd_mime_part_detect_language (task, text_part, text_part->ucs_len);
+                       rspamd_mime_part_detect_language (task, text_part);
                }
 
                rspamd_mime_part_extract_words (task, text_part);
 
-               if (text_part->normalized_words) {
-                       total_words += text_part->normalized_words->len;
+               if (text_part->utf_words) {
+                       total_words += text_part->utf_words->len;
                }
        }
 
index baabb762aa9e3b6636172f80a4dfe26bd5afd53e..e4b5a3d4b78d47c1e8f20539f78da05858553f9b 100644 (file)
@@ -86,20 +86,28 @@ struct rspamd_mime_text_part {
        const gchar *language;
        GPtrArray *languages;
        const gchar *real_charset;
+
+       /* Raw data in native encoding */
        rspamd_ftok_t raw;
        rspamd_ftok_t parsed; /* decoded from mime encodings */
-       GByteArray *content; /* utf8 encoded processed content */
 
-       GArray *ucs_raw_content; /* unicode raw content (of UChar) */
+       /* UTF8 content */
+       GByteArray *utf_content; /* utf8 encoded processed content */
        GByteArray *utf_raw_content; /* utf raw content */
-       GByteArray *stripped_content; /* utf content with no newlines */
+       GByteArray *utf_stripped_content; /* utf content with no newlines */
+       GArray *normalized_hashes;
+       GArray *utf_words;
+
+       /* Unicode content, used by libicu */
+       GArray *unicode_raw_content; /* unicode raw content (of UChar) */
+       GArray *unicode_content; /* unicode processed content (of UChar) */
+       GArray *unicode_words;
+
        GPtrArray *newlines;    /**< positions of newlines in text, relative to content*/
        struct html_content *html;
        GList *exceptions;      /**< list of offsets of urls                                            */
        struct rspamd_mime_part *mime_part;
-       GArray *normalized_words;
-       GArray *ucs32_words;
-       GArray *normalized_hashes;
+
        guint flags;
        guint nlines;
        guint spaces;
@@ -110,7 +118,6 @@ struct rspamd_mime_text_part {
        guint empty_lines;
        guint capital_letters;
        guint numeric_characters;
-       guint ucs_len;
 };
 
 enum rspamd_received_type {
index d3f255740b1ae43000d9bfb572f2ff900cafacc8..a0abb1bb04467fad80b24e972771e21d107ec719 100644 (file)
@@ -283,18 +283,18 @@ rspamd_mime_text_part_ucs_from_utf (struct rspamd_task *task,
 
        rspamd_mime_utf8_conv_init ();
        utf = text_part->utf_raw_content;
-       text_part->ucs_raw_content = g_array_sized_new (FALSE, FALSE,
+       text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE,
                        sizeof (UChar), utf->len + 1);
-       text_part->ucs_raw_content->len = ucnv_toUChars (utf8_converter,
-                       (UChar *)text_part->ucs_raw_content->data,
+       text_part->unicode_raw_content->len = ucnv_toUChars (utf8_converter,
+                       (UChar *)text_part->unicode_raw_content->data,
                        utf->len + 1,
                        utf->data,
                        utf->len,
                        &uc_err);
 
        if (!U_SUCCESS (uc_err)) {
-               g_array_free (text_part->ucs_raw_content, TRUE);
-               text_part->ucs_raw_content = NULL;
+               g_array_free (text_part->unicode_raw_content, TRUE);
+               text_part->unicode_raw_content = NULL;
        }
 }
 
@@ -311,12 +311,12 @@ rspamd_mime_text_part_normalise (struct rspamd_task *task,
                norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err);
        }
 
-       if (!text_part->ucs_raw_content) {
+       if (!text_part->unicode_raw_content) {
                return;
        }
 
-       src = (UChar *)text_part->ucs_raw_content->data;
-       nsym = text_part->ucs_raw_content->len;
+       src = (UChar *)text_part->unicode_raw_content->data;
+       nsym = text_part->unicode_raw_content->len;
 
        /* We can now check if we need to decompose */
        end = unorm2_spanQuickCheckYes (norm, src, nsym, &uc_err);
@@ -346,8 +346,8 @@ rspamd_mime_text_part_normalise (struct rspamd_task *task,
        }
        else {
                /* Copy normalised back */
-               memcpy (text_part->ucs_raw_content->data, dest, nsym * sizeof (UChar));
-               text_part->ucs_raw_content->len = nsym;
+               memcpy (text_part->unicode_raw_content->data, dest, nsym * sizeof (UChar));
+               text_part->unicode_raw_content->len = nsym;
                text_part->flags |= RSPAMD_MIME_TEXT_PART_NORMALISED;
        }
 
@@ -369,16 +369,16 @@ rspamd_mime_text_part_maybe_renormalise (struct rspamd_task *task,
        rspamd_mime_utf8_conv_init ();
 
        if ((text_part->flags & RSPAMD_MIME_TEXT_PART_NORMALISED) &&
-               text_part->ucs_raw_content) {
+               text_part->unicode_raw_content) {
                clen = ucnv_getMaxCharSize (utf8_converter);
-               dlen = UCNV_GET_MAX_BYTES_FOR_STRING (text_part->ucs_raw_content->len,
+               dlen = UCNV_GET_MAX_BYTES_FOR_STRING (text_part->unicode_raw_content->len,
                                clen);
                g_byte_array_set_size (text_part->utf_raw_content, dlen);
                r = ucnv_fromUChars (utf8_converter,
                                text_part->utf_raw_content->data,
                                dlen,
-                               (UChar *)text_part->ucs_raw_content->data,
-                               text_part->ucs_raw_content->len,
+                               (UChar *)text_part->unicode_raw_content->data,
+                               text_part->unicode_raw_content->len,
                                &uc_err);
                text_part->utf_raw_content->len = r;
        }
@@ -410,10 +410,10 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
        }
 
 
-       text_part->ucs_raw_content = g_array_sized_new (FALSE, FALSE,
+       text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE,
                        sizeof (UChar), input->len + 1);
        r = ucnv_toUChars (conv,
-                       (UChar *)text_part->ucs_raw_content->data,
+                       (UChar *)text_part->unicode_raw_content->data,
                        input->len + 1,
                        input->data,
                        input->len,
@@ -426,7 +426,7 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
                return FALSE;
        }
 
-       text_part->ucs_raw_content->len = r;
+       text_part->unicode_raw_content->len = r;
        rspamd_mime_text_part_normalise (task, text_part);
 
        /* Now, convert to utf8 */
@@ -434,7 +434,7 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
        dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
        d = rspamd_mempool_alloc (task->task_pool, dlen);
        r = ucnv_fromUChars (utf8_converter, d, dlen,
-                       (UChar *)text_part->ucs_raw_content->data, r, &uc_err);
+                       (UChar *)text_part->unicode_raw_content->data, r, &uc_err);
 
        if (!U_SUCCESS (uc_err)) {
                g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
@@ -750,3 +750,30 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
 
        SET_PART_UTF (text_part);
 }
+
+/*
+ * Decode the UTF-8 bytes of `in` into ICU UChar units stored in `dest`.
+ * `dest` is resized and its previous content is overwritten; if the
+ * conversion fails `dest` is left with zero length.
+ */
+void
+rspamd_utf_to_unicode (GByteArray *in, GArray *dest)
+{
+       UErrorCode uc_err = U_ZERO_ERROR;
+       int32_t r;
+
+       /* Other users of utf8_converter in this file run the init helper first */
+       rspamd_mime_utf8_conv_init ();
+
+       /* UTF-16 output never needs more units than UTF-8 input bytes (+ NUL) */
+       g_array_set_size (dest, in->len + 1);
+       r = ucnv_toUChars (utf8_converter,
+                       (UChar *)dest->data,
+                       in->len + 1,
+                       in->data,
+                       in->len,
+                       &uc_err);
+
+       /* Never publish a length that comes from a failed conversion */
+       dest->len = (U_SUCCESS (uc_err) && r > 0) ? (guint)r : 0;
+}
index 5e30efdaea3404513e3dbe523452a0a3635edafd..0754bb3484e1027a3f82a9172601e15ba3aac552 100644 (file)
@@ -86,4 +86,11 @@ gboolean rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
  */
 void rspamd_mime_charset_utf_enforce (gchar *in, gsize len);
 
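+/**
+ * Converts utf8 to libicu unichars (UTF-16 code units)
+ * @param in input bytes encoded in UTF-8
+ * @param dest array of UChar that is resized and filled with the result
+ */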
+void rspamd_utf_to_unicode (GByteArray *in, GArray *dest);
+
 #endif /* SRC_LIBMIME_MIME_ENCODING_H_ */
index c47db5761e38fdcf6faaa45d081b335f9e632300..268376e4d540db9dc5f4ca7954e73e6cca04fe3e 100644 (file)
@@ -905,8 +905,8 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
                                                        raw = TRUE;
                                                }
 
-                                               in = part->content->data;
-                                               len = part->content->len;
+                                               in = part->utf_content->data;
+                                               len = part->utf_content->len;
                                        }
                                }
 
@@ -1006,9 +1006,9 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
                for (i = 0; i < task->text_parts->len; i++) {
                        part = g_ptr_array_index (task->text_parts, i);
 
-                       if (part->stripped_content) {
-                               scvec[i + 1] = (guchar *)part->stripped_content->data;
-                               lenvec[i + 1] = part->stripped_content->len;
+                       if (part->utf_stripped_content) {
+                               scvec[i + 1] = (guchar *)part->utf_stripped_content->data;
+                               lenvec[i + 1] = part->utf_stripped_content->len;
                        }
                        else {
                                scvec[i + 1] = (guchar *)"";
index bfeec990bf9153d7d3ca44f0dc1056bb603c9104..07efcd182b0256ff0f71655a8716f21b3a8113a4 100644 (file)
@@ -242,20 +242,20 @@ rspamd_task_free (struct rspamd_task *task)
                for (i = 0; i < task->text_parts->len; i ++) {
                        tp = g_ptr_array_index (task->text_parts, i);
 
-                       if (tp->normalized_words) {
-                               g_array_free (tp->normalized_words, TRUE);
+                       if (tp->utf_words) {
+                               g_array_free (tp->utf_words, TRUE);
                        }
                        if (tp->normalized_hashes) {
                                g_array_free (tp->normalized_hashes, TRUE);
                        }
-                       if (tp->ucs32_words) {
-                               g_array_free (tp->ucs32_words, TRUE);
+                       if (tp->unicode_words) {
+                               g_array_free (tp->unicode_words, TRUE);
                        }
                        if (tp->languages) {
                                g_ptr_array_unref (tp->languages);
                        }
-                       if (tp->ucs_raw_content) {
-                               g_array_free (tp->ucs_raw_content, TRUE);
+                       if (tp->unicode_raw_content) {
+                               g_array_free (tp->unicode_raw_content, TRUE);
                        }
                }
 
index 653cc3570665ed21f034d7429868cd206d340ba5..9e6ab72dbe3ed20a69c07c94230d52790cc7bacd 100644 (file)
@@ -2624,7 +2624,7 @@ rspamd_url_text_extract (rspamd_mempool_t *pool,
 {
        struct rspamd_url_mimepart_cbdata mcbd;
 
-       if (part->stripped_content == NULL || part->stripped_content->len == 0) {
+       if (part->utf_stripped_content == NULL || part->utf_stripped_content->len == 0) {
                msg_warn_task ("got empty text part");
                return;
        }
@@ -2632,8 +2632,8 @@ rspamd_url_text_extract (rspamd_mempool_t *pool,
        mcbd.task = task;
        mcbd.part = part;
 
-       rspamd_url_find_multiple (task->task_pool, part->stripped_content->data,
-                       part->stripped_content->len, is_html, part->newlines,
+       rspamd_url_find_multiple (task->task_pool, part->utf_stripped_content->data,
+                       part->utf_stripped_content->len, is_html, part->newlines,
                        rspamd_url_text_part_callback, &mcbd);
 }
 
index 540a9e23f5252b7151fa0e669b35b923baccea05..394173444df65f63bc08762da90789090da993c4 100644 (file)
@@ -331,8 +331,8 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
        for (i = 0; i < task->text_parts->len; i++) {
                part = g_ptr_array_index (task->text_parts, i);
 
-               if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) {
-                       reserved_len += part->normalized_words->len;
+               if (!IS_PART_EMPTY (part) && part->utf_words != NULL) {
+                       reserved_len += part->utf_words->len;
                }
                /* XXX: normal window size */
                reserved_len += 5;
@@ -346,9 +346,9 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
        for (i = 0; i < task->text_parts->len; i ++) {
                part = g_ptr_array_index (task->text_parts, i);
 
-               if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) {
+               if (!IS_PART_EMPTY (part) && part->utf_words != NULL) {
                        st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool,
-                                       part->normalized_words, IS_PART_UTF (part),
+                                       part->utf_words, IS_PART_UTF (part),
                                        NULL, task->tokens);
                }
 
index fce98c53fcf767fbe4bd201a9052ae45d181e698..5436430fe9cdbea65fabcae36010f262dfd24ef8 100644 (file)
@@ -59,7 +59,7 @@ const gchar t_delimiters[255] = {
 
 /* Get next word from specified f_str_t buf */
 static gboolean
-rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
+rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf,
                gchar const **cur, rspamd_stat_token_t * token,
                GList **exceptions, gsize *rl, gboolean unused)
 {
@@ -149,7 +149,7 @@ rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf,
 }
 
 static gboolean
-rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
+rspamd_tokenizer_get_word_utf8 (rspamd_stat_token_t * buf,
                gchar const **cur, rspamd_stat_token_t * token,
                GList **exceptions, gsize *rl,
                gboolean check_signature)
@@ -355,10 +355,10 @@ rspamd_tokenize_text (const gchar *text, gsize len,
 
        switch (how) {
        case RSPAMD_TOKENIZE_RAW:
-               func = rspamd_tokenizer_get_word_compat;
+               func = rspamd_tokenizer_get_word_raw;
                break;
        case RSPAMD_TOKENIZE_UTF:
-               func = rspamd_tokenizer_get_word;
+               func = rspamd_tokenizer_get_word_utf8;
                break;
        default:
                g_assert_not_reached ();
index 8be5f98a843f2f794fa8e1b98c73cabc3aee4664..16ab142fd86f7b5272716db841cf145820763cfc 100644 (file)
@@ -28,7 +28,7 @@ struct rspamd_stat_tokenizer {
 enum rspamd_tokenize_type {
        RSPAMD_TOKENIZE_UTF = 0,
        RSPAMD_TOKENIZE_RAW,
-       RSPAMD_TOKENIZE_UCS
+       RSPAMD_TOKENIZE_UNICODE
 };
 
 /* Compare two token nodes */
index bb3406e80f352b9980f00e744b33115ff4567242..78c3e05b9da81ad54fcd4658bd084528b4884498 100644 (file)
@@ -549,16 +549,16 @@ lua_textpart_get_content (lua_State * L)
        rspamd_lua_setclass (L, "rspamd{text}", -1);
 
        if (!type) {
-               start = part->content->data;
-               len = part->content->len;
+               start = part->utf_content->data;
+               len = part->utf_content->len;
        }
        else if (strcmp (type, "content") == 0) {
-               start = part->content->data;
-               len = part->content->len;
+               start = part->utf_content->data;
+               len = part->utf_content->len;
        }
        else if (strcmp (type, "content_oneline") == 0) {
-               start = part->stripped_content->data;
-               len = part->stripped_content->len;
+               start = part->utf_stripped_content->data;
+               len = part->utf_stripped_content->len;
        }
        else if (strcmp (type, "raw_parsed") == 0) {
                start = part->parsed.begin;
@@ -618,8 +618,8 @@ lua_textpart_get_content_oneline (lua_State * L)
 
        t = lua_newuserdata (L, sizeof (*t));
        rspamd_lua_setclass (L, "rspamd{text}", -1);
-       t->start = part->stripped_content->data;
-       t->len = part->stripped_content->len;
+       t->start = part->utf_stripped_content->data;
+       t->len = part->utf_stripped_content->len;
        t->flags = 0;
 
        return 1;
@@ -636,11 +636,11 @@ lua_textpart_get_length (lua_State * L)
                return 1;
        }
 
-       if (IS_PART_EMPTY (part) || part->content == NULL) {
+       if (IS_PART_EMPTY (part) || part->utf_content == NULL) {
                lua_pushinteger (L, 0);
        }
        else {
-               lua_pushinteger (L, part->content->len);
+               lua_pushinteger (L, part->utf_content->len);
        }
 
        return 1;
@@ -721,11 +721,11 @@ lua_textpart_get_words_count (lua_State *L)
                return 1;
        }
 
-       if (IS_PART_EMPTY (part) || part->normalized_words == NULL) {
+       if (IS_PART_EMPTY (part) || part->utf_words == NULL) {
                lua_pushinteger (L, 0);
        }
        else {
-               lua_pushinteger (L, part->normalized_words->len);
+               lua_pushinteger (L, part->utf_words->len);
        }
 
        return 1;
@@ -743,14 +743,14 @@ lua_textpart_get_words (lua_State *L)
                return luaL_error (L, "invalid arguments");
        }
 
-       if (IS_PART_EMPTY (part) || part->normalized_words == NULL) {
+       if (IS_PART_EMPTY (part) || part->utf_words == NULL) {
                lua_createtable (L, 0, 0);
        }
        else {
-               lua_createtable (L, part->normalized_words->len, 0);
+               lua_createtable (L, part->utf_words->len, 0);
 
-               for (i = 0; i < part->normalized_words->len; i ++) {
-                       w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
+               for (i = 0; i < part->utf_words->len; i ++) {
+                       w = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
 
                        lua_pushlstring (L, w->begin, w->len);
                        lua_rawseti (L, -2, i + 1);
@@ -876,8 +876,8 @@ struct lua_shingle_data {
 };
 
 #define STORE_TOKEN(i, t) do { \
-    if ((i) < part->normalized_words->len) { \
-        word = &g_array_index (part->normalized_words, rspamd_stat_token_t, (i)); \
+    if ((i) < part->utf_words->len) { \
+        word = &g_array_index (part->utf_words, rspamd_stat_token_t, (i)); \
         sd->t.begin = word->begin; \
         sd->t.len = word->len; \
     } \
@@ -936,8 +936,8 @@ lua_textpart_get_fuzzy_hashes (lua_State * L)
                /* Calculate direct hash */
                rspamd_cryptobox_hash_init (&st, key, rspamd_cryptobox_HASHKEYBYTES);
 
-               for (i = 0; i < part->normalized_words->len; i ++) {
-                       word = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
+               for (i = 0; i < part->utf_words->len; i ++) {
+                       word = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
                        rspamd_cryptobox_hash_update (&st, word->begin, word->len);
                }
 
@@ -947,7 +947,7 @@ lua_textpart_get_fuzzy_hashes (lua_State * L)
                                sizeof (hexdigest));
                lua_pushlstring (L, hexdigest, sizeof (hexdigest) - 1);
 
-               sgl = rspamd_shingles_from_text (part->normalized_words, key,
+               sgl = rspamd_shingles_from_text (part->utf_words, key,
                                pool, lua_shingles_filter, part, RSPAMD_SHINGLES_MUMHASH);
 
                if (sgl == NULL) {
index 16a8ace0cee8cc1d676eeb84d23e1a4c25c70236..e6a6052d4446b0a40510b8d497af58c510959b5d 100644 (file)
@@ -262,9 +262,9 @@ lua_trie_search_mime (lua_State *L)
                for (i = 0; i < task->text_parts->len; i ++) {
                        part = g_ptr_array_index (task->text_parts, i);
 
-                       if (!IS_PART_EMPTY (part) && part->content != NULL) {
-                               text = part->content->data;
-                               len = part->content->len;
+                       if (!IS_PART_EMPTY (part) && part->utf_content != NULL) {
+                               text = part->utf_content->data;
+                               len = part->utf_content->len;
 
                                if (lua_trie_search_str (L, trie, text, len) != 0) {
                                        found = TRUE;
index 987879258159d1f3b6dde8a07668bdd886a6e72c..3c7157311f27a84cc5a31f0d1d94cc8b6cd468d2 100644 (file)
@@ -560,13 +560,13 @@ rspamd_chartable_process_part (struct rspamd_task *task,
        guint i, ncap = 0;
        gdouble cur_score = 0.0;
 
-       if (part == NULL || part->normalized_words == NULL ||
-                       part->normalized_words->len == 0) {
+       if (part == NULL || part->utf_words == NULL ||
+                       part->utf_words->len == 0) {
                return;
        }
 
-       for (i = 0; i < part->normalized_words->len; i++) {
-               w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
+       for (i = 0; i < part->utf_words->len; i++) {
+               w = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
 
                if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
 
@@ -588,7 +588,7 @@ rspamd_chartable_process_part (struct rspamd_task *task,
         */
        part->capital_letters += ncap;
 
-       cur_score /= (gdouble)part->normalized_words->len;
+       cur_score /= (gdouble)part->utf_words->len;
 
        if (cur_score > 2.0) {
                cur_score = 2.0;
index c0fd8aa4c6531c462adba2dc48320cbd2df9ffb0..bf08c0e46a8dc8661fc1a4ab83c400d55dd38374 100644 (file)
@@ -1196,7 +1196,7 @@ fuzzy_io_fin (void *ud)
 static GArray *
 fuzzy_preprocess_words (struct rspamd_mime_text_part *part, rspamd_mempool_t *pool)
 {
-       return part->normalized_words;
+       return part->utf_words;
 }
 
 static void
@@ -1418,8 +1418,8 @@ fuzzy_cmd_from_text_part (struct rspamd_task *task,
                        rspamd_cryptobox_hash_init (&st, rule->hash_key->str,
                                        rule->hash_key->len);
 
-                       rspamd_cryptobox_hash_update (&st, part->stripped_content->data,
-                                       part->stripped_content->len);
+                       rspamd_cryptobox_hash_update (&st, part->utf_stripped_content->data,
+                                       part->utf_stripped_content->len);
 
                        if (task->subject) {
                                /* We also include subject */
@@ -2615,7 +2615,7 @@ fuzzy_generate_commands (struct rspamd_task *task, struct fuzzy_rule *rule,
                        }
 
                        /* Check length of part */
-                       fac = rule->ctx->text_multiplier * part->content->len;
+                       fac = rule->ctx->text_multiplier * part->utf_content->len;
                        if ((double)min_bytes > fac) {
                                if (!rule->short_text_direct_hash) {
                                        msg_info_task (
@@ -2624,7 +2624,7 @@ fuzzy_generate_commands (struct rspamd_task *task, struct fuzzy_rule *rule,
                                                                        "skip fuzzy check",
                                                        task->message_id, min_bytes,
                                                        fac,
-                                                       part->content->len,
+                                                       part->utf_content->len,
                                                        rule->ctx->text_multiplier);
                                        continue;
                                }
@@ -2635,21 +2635,21 @@ fuzzy_generate_commands (struct rspamd_task *task, struct fuzzy_rule *rule,
                                                                        "use direct hash",
                                                        task->message_id, min_bytes,
                                                        fac,
-                                                       part->content->len,
+                                                       part->utf_content->len,
                                                        rule->ctx->text_multiplier);
                                        short_text = TRUE;
                                }
                        }
 
-                       if (part->normalized_words == NULL ||
-                                       part->normalized_words->len == 0) {
+                       if (part->utf_words == NULL ||
+                                       part->utf_words->len == 0) {
                                msg_info_task ("<%s>, part hash empty, skip fuzzy check",
                                                task->message_id);
                                continue;
                        }
 
                        if (rule->ctx->min_hash_len != 0 &&
-                                       part->normalized_words->len <
+                                       part->utf_words->len <
                                                        rule->ctx->min_hash_len) {
                                if (!rule->short_text_direct_hash) {
                                        msg_info_task (