From: Vsevolod Stakhov Date: Wed, 5 Sep 2018 16:43:20 +0000 (+0100) Subject: [Rework] Rework utf content processing in text parts X-Git-Tag: 1.8.0~186 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=a64ce9b4245153e68fbbcd9c6610b9c1ccf76493;p=rspamd.git [Rework] Rework utf content processing in text parts - Store unicode in UTF parts - Store unicode for HTML parts - Rename struct fields and split them into unicode/utf components --- diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index 8763365af..d3c418203 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -1323,7 +1323,7 @@ rspamd_language_detector_cmp_heuristic (gconstpointer a, gconstpointer b, GPtrArray * rspamd_language_detector_detect (struct rspamd_task *task, struct rspamd_lang_detector *d, - GArray *ucs_tokens, gsize words_len) + GArray *ucs_tokens) { khash_t(rspamd_candidates_hash) *candidates; GPtrArray *result; diff --git a/src/libmime/lang_detection.h b/src/libmime/lang_detection.h index 2d28ec65a..2ede46d02 100644 --- a/src/libmime/lang_detection.h +++ b/src/libmime/lang_detection.h @@ -61,6 +61,6 @@ void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d, */ GPtrArray * rspamd_language_detector_detect (struct rspamd_task *task, struct rspamd_lang_detector *d, - GArray *ucs_tokens, gsize words_len); + GArray *ucs_tokens); #endif diff --git a/src/libmime/message.c b/src/libmime/message.c index e6cb63504..1df980758 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -67,7 +67,7 @@ rspamd_mime_part_extract_words (struct rspamd_task *task, guint i, nlen, total_len = 0, short_len = 0; gdouble avg_len = 0; - if (part->normalized_words) { + if (part->utf_words) { #ifdef WITH_SNOWBALL static GHashTable *stemmers = NULL; @@ -97,10 +97,10 @@ rspamd_mime_part_extract_words (struct rspamd_task *task, #endif - for (i = 0; i < part->normalized_words->len; i++) { + for (i = 0; i < part->utf_words->len; i++) { guint64 h; - w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i); + w = &g_array_index (part->utf_words, rspamd_stat_token_t, i); r = NULL; #ifdef WITH_SNOWBALL if (stem) { @@ -156,7 +156,7 @@ rspamd_mime_part_extract_words (struct rspamd_task *task, } } - if (part->normalized_words && part->normalized_words->len) { + if (part->utf_words && part->utf_words->len) { gdouble *avg_len_p, *short_len_p; avg_len_p = rspamd_mempool_get_variable (task->task_pool, @@ -205,41 +205,41 @@ rspamd_mime_part_create_words (struct rspamd_task *task, /* Ugly workaround */ if (IS_PART_HTML (part)) { - part->normalized_words = rspamd_tokenize_text ( - part->stripped_content->data, - part->stripped_content->len, tok_type, task->cfg, + part->utf_words = rspamd_tokenize_text ( + part->utf_stripped_content->data, + part->utf_stripped_content->len, tok_type, task->cfg, part->exceptions, NULL); } else { - part->normalized_words = rspamd_tokenize_text ( - part->stripped_content->data, - part->stripped_content->len, tok_type, task->cfg, + part->utf_words = rspamd_tokenize_text ( + part->utf_stripped_content->data, + part->utf_stripped_content->len, tok_type, task->cfg, part->exceptions, NULL); } - if (part->normalized_words) { + if (part->utf_words) { part->normalized_hashes = g_array_sized_new (FALSE, FALSE, - sizeof (guint64), part->normalized_words->len); + sizeof (guint64), part->utf_words->len); if (IS_PART_UTF (part) && task->lang_det) { - part->ucs32_words = g_array_sized_new (FALSE, FALSE, - sizeof (rspamd_stat_token_t), part->normalized_words->len); + part->unicode_words = g_array_sized_new (FALSE, FALSE, + sizeof (rspamd_stat_token_t), part->utf_words->len); } - if (part->ucs32_words) { + if (part->unicode_words) { - for (i = 0; i < part->normalized_words->len; i++) { - w = &g_array_index (part->normalized_words, rspamd_stat_token_t, + for (i = 0; i < part->utf_words->len; i++) { + w = &g_array_index (part->utf_words, rspamd_stat_token_t, i); if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) { rspamd_language_detector_to_ucs (task->lang_det, task->task_pool, w, &ucs_w); - g_array_append_val (part->ucs32_words, ucs_w); + g_array_append_val (part->unicode_words, ucs_w); ucs_len += ucs_w.len; } } @@ -251,14 +251,14 @@ rspamd_mime_part_create_words (struct rspamd_task *task, static void rspamd_mime_part_detect_language (struct rspamd_task *task, - struct rspamd_mime_text_part *part, guint ucs_len) + struct rspamd_mime_text_part *part) { struct rspamd_lang_detector_res *lang; - if (part->ucs32_words) { + if (part->unicode_words) { part->languages = rspamd_language_detector_detect (task, task->lang_det, - part->ucs32_words, ucs_len); + part->unicode_words); if (part->languages->len > 0) { lang = g_ptr_array_index (part->languages, 0); @@ -289,7 +289,7 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, state = seen_cr; if (p > c) { last_c = *(p - 1); - g_byte_array_append (part->stripped_content, + g_byte_array_append (part->utf_stripped_content, (const guint8 *)c, p - c); } @@ -299,11 +299,11 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, case seen_cr: /* Double \r\r */ if (!crlf_added) { - g_byte_array_append (part->stripped_content, + g_byte_array_append (part->utf_stripped_content, (const guint8 *)" ", 1); crlf_added = TRUE; g_ptr_array_add (part->newlines, - (((gpointer) (goffset) (part->stripped_content->len)))); + (((gpointer) (goffset) (part->utf_stripped_content->len)))); } part->nlines ++; @@ -326,17 +326,17 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, if (p > c) { last_c = *(p - 1); - g_byte_array_append (part->stripped_content, + g_byte_array_append (part->utf_stripped_content, (const guint8 *)c, p - c); } c = p + 1; if (IS_PART_HTML (part) || g_ascii_ispunct (last_c)) { - g_byte_array_append (part->stripped_content, + g_byte_array_append (part->utf_stripped_content, (const guint8 *)" ", 1); g_ptr_array_add (part->newlines, - (((gpointer) (goffset) (part->stripped_content->len)))); + (((gpointer) (goffset) (part->utf_stripped_content->len)))); crlf_added = TRUE; } else { @@ -348,13 +348,13 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, /* \r\n */ if (!crlf_added) { if (IS_PART_HTML (part) || g_ascii_ispunct (last_c)) { - g_byte_array_append (part->stripped_content, + g_byte_array_append (part->utf_stripped_content, (const guint8 *) " ", 1); crlf_added = TRUE; } g_ptr_array_add (part->newlines, - (((gpointer) (goffset) (part->stripped_content->len)))); + (((gpointer) (goffset) (part->utf_stripped_content->len)))); } c = p + 1; @@ -364,11 +364,11 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, case seen_lf: /* Double \n\n */ if (!crlf_added) { - g_byte_array_append (part->stripped_content, + g_byte_array_append (part->utf_stripped_content, (const guint8 *)" ", 1); crlf_added = TRUE; g_ptr_array_add (part->newlines, - (((gpointer) (goffset) (part->stripped_content->len)))); + (((gpointer) (goffset) (part->utf_stripped_content->len)))); } part->nlines++; @@ -414,13 +414,13 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, if (!crlf_added) { g_ptr_array_add (part->newlines, - (((gpointer) (goffset) (part->stripped_content->len)))); + (((gpointer) (goffset) (part->utf_stripped_content->len)))); } /* Skip initial spaces */ if (G_UNLIKELY (*p == ' ')) { if (!crlf_added) { - g_byte_array_append (part->stripped_content, + g_byte_array_append (part->utf_stripped_content, (const guint8 *)" ", 1); } @@ -451,7 +451,7 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, switch (state) { case normal_char: - g_byte_array_append (part->stripped_content, + g_byte_array_append (part->utf_stripped_content, (const guint8 *)c, p - c); while (c < p) { @@ -479,10 +479,10 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, default: if (!crlf_added) { - g_byte_array_append (part->stripped_content, + g_byte_array_append (part->utf_stripped_content, (const guint8 *)" ", 1); g_ptr_array_add (part->newlines, - (((gpointer) (goffset) (part->stripped_content->len)))); + (((gpointer) (goffset) (part->utf_stripped_content->len)))); } part->nlines++; @@ -502,10 +502,10 @@ rspamd_normalize_text_part (struct rspamd_task *task, struct rspamd_process_exception *ex; /* Strip newlines */ - part->stripped_content = g_byte_array_sized_new (part->content->len); + part->utf_stripped_content = g_byte_array_sized_new (part->utf_content->len); part->newlines = g_ptr_array_sized_new (128); - p = (const gchar *)part->content->data; - end = p + part->content->len; + p = (const gchar *)part->utf_content->data; + end = p + part->utf_content->len; rspamd_strip_newlines_parse (p, end, part); @@ -513,7 +513,7 @@ rspamd_normalize_text_part (struct rspamd_task *task, ex = rspamd_mempool_alloc (task->task_pool, sizeof (*ex)); off = (goffset)g_ptr_array_index (part->newlines, i); g_ptr_array_index (part->newlines, i) = (gpointer)(goffset) - (part->stripped_content->data + off); + (part->utf_stripped_content->data + off); ex->pos = off; ex->len = 0; ex->type = RSPAMD_EXCEPTION_NEWLINE; @@ -522,7 +522,7 @@ rspamd_normalize_text_part (struct rspamd_task *task, rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t) free_byte_array_callback, - part->stripped_content); + part->utf_stripped_content); rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard, part->newlines); @@ -615,10 +615,10 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part g_assert (rspamd_multipattern_compile (gtube_matcher, NULL)); } - if (part->content && part->content->len >= sizeof (gtube_pattern_reject) && - part->content->len <= max_check_size) { - if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->content->data, - part->content->len, + if (part->utf_content && part->utf_content->len >= sizeof (gtube_pattern_reject) && + part->utf_content->len <= max_check_size) { + if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->utf_content->data, + part->utf_content->len, rspamd_multipattern_gtube_cb, NULL, NULL)) > 0) { switch (ret) { @@ -639,7 +639,7 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part msg_info_task ( "<%s>: gtube %s pattern has been found in part of length %ud", task->message_id, rspamd_action_to_str (act), - part->content->len); + part->utf_content->len); } } } @@ -655,9 +655,86 @@ exceptions_compare_func (gconstpointer a, gconstpointer b) return ea->pos - eb->pos; } +static gboolean +rspamd_message_process_plain_text_part (struct rspamd_task *task, + struct rspamd_mime_text_part *text_part) +{ + if (text_part->parsed.len == 0) { + text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; + + return TRUE; + } + + rspamd_mime_text_part_maybe_convert (task, text_part); + + if (text_part->utf_raw_content != NULL) { + /* Different from HTML, where we also parse HTML and strip tags */ + text_part->utf_content = text_part->utf_raw_content; + text_part->unicode_content = text_part->unicode_raw_content; + } + else { + /* + * We ignore unconverted parts from now as it is dangerous + * to treat them as text parts + */ + + return FALSE; + } + + return TRUE; +} + +static gboolean +rspamd_message_process_html_text_part (struct rspamd_task *task, + struct rspamd_mime_text_part *text_part) +{ + text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML; + + if (text_part->parsed.len == 0) { + text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; + + return TRUE; + } + + rspamd_mime_text_part_maybe_convert (task, text_part); + + if (text_part->utf_raw_content == NULL) { + return FALSE; + } + + text_part->html = rspamd_mempool_alloc0 (task->task_pool, + sizeof (*text_part->html)); + text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_BALANCED; + text_part->utf_content = rspamd_html_process_part_full ( + task->task_pool, + text_part->html, + text_part->utf_raw_content, + &text_part->exceptions, + task->urls, + task->emails); + + if (text_part->utf_content->len == 0) { + text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; + } + + /* Also add unicode content */ + text_part->unicode_content = g_array_sized_new (FALSE, FALSE, + sizeof (UChar), text_part->utf_content->len + 1); + rspamd_utf_to_unicode (text_part->utf_content, text_part->unicode_content); + + rspamd_mempool_add_destructor (task->task_pool, + (rspamd_mempool_destruct_t) free_byte_array_callback, + text_part->utf_content); + rspamd_mempool_add_destructor (task->task_pool, + rspamd_array_free_hard, + text_part->unicode_content); + + return TRUE; +} + static void -rspamd_message_process_text_part (struct rspamd_task *task, - struct rspamd_mime_part *mime_part) +rspamd_message_process_text_part_maybe (struct rspamd_task *task, + struct rspamd_mime_part *mime_part) { struct rspamd_mime_text_part *text_part; rspamd_ftok_t html_tok, xhtml_tok; @@ -738,87 +815,31 @@ rspamd_message_process_text_part (struct rspamd_task *task, debug_task ("skip attachments for checking as text parts"); return; } - - if (found_html) { - text_part = rspamd_mempool_alloc0 (task->task_pool, - sizeof (struct rspamd_mime_text_part)); - text_part->raw.begin = mime_part->raw_data.begin; - text_part->raw.len = mime_part->raw_data.len; - text_part->parsed.begin = mime_part->parsed_data.begin; - text_part->parsed.len = mime_part->parsed_data.len; - text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML; - text_part->mime_part = mime_part; - - if (mime_part->parsed_data.len == 0) { - text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; - g_ptr_array_add (task->text_parts, text_part); - return; - } - - rspamd_mime_text_part_maybe_convert (task, text_part); - - if (text_part->utf_raw_content == NULL) { - return; - } - - text_part->html = rspamd_mempool_alloc0 (task->task_pool, - sizeof (*text_part->html)); - text_part->mime_part = mime_part; - - text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_BALANCED; - text_part->content = rspamd_html_process_part_full ( - task->task_pool, - text_part->html, - text_part->utf_raw_content, - &text_part->exceptions, - task->urls, - task->emails); - - if (text_part->content->len == 0) { - text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; - } - - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t) free_byte_array_callback, - text_part->content); - g_ptr_array_add (task->text_parts, text_part); + else if (!(found_txt || found_html)) { + /* Not a text part */ + return; } - else if (found_txt) { - text_part = - rspamd_mempool_alloc0 (task->task_pool, - sizeof (struct rspamd_mime_text_part)); - text_part->mime_part = mime_part; - text_part->raw.begin = mime_part->raw_data.begin; - text_part->raw.len = mime_part->raw_data.len; - text_part->parsed.begin = mime_part->parsed_data.begin; - text_part->parsed.len = mime_part->parsed_data.len; - text_part->mime_part = mime_part; - - if (mime_part->parsed_data.len == 0) { - text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; - g_ptr_array_add (task->text_parts, text_part); - return; - } - rspamd_mime_text_part_maybe_convert (task, text_part); + text_part = rspamd_mempool_alloc0 (task->task_pool, + sizeof (struct rspamd_mime_text_part)); + text_part->mime_part = mime_part; + text_part->raw.begin = mime_part->raw_data.begin; + text_part->raw.len = mime_part->raw_data.len; + text_part->parsed.begin = mime_part->parsed_data.begin; + text_part->parsed.len = mime_part->parsed_data.len; - if (text_part->utf_raw_content != NULL) { - /* - * We ignore unconverted parts from now as it is dangerous - * to treat them as text parts - */ - text_part->content = text_part->utf_raw_content; - g_ptr_array_add (task->text_parts, text_part); - } - else { + if (found_html) { + if (!rspamd_message_process_html_text_part (task, text_part)) { return; } } else { - return; + if (!rspamd_message_process_plain_text_part (task, text_part)) { + return; + } } - + g_ptr_array_add (task->text_parts, text_part); mime_part->flags |= RSPAMD_MIME_PART_TEXT; mime_part->specific.txt = text_part; @@ -867,7 +888,7 @@ rspamd_message_process_text_part (struct rspamd_task *task, text_part->exceptions); } - text_part->ucs_len = rspamd_mime_part_create_words (task, text_part); + rspamd_mime_part_create_words (task, text_part); } /* Creates message from various data using libmagic to detect type */ @@ -1172,7 +1193,7 @@ rspamd_message_process (struct rspamd_task *task) struct rspamd_mime_part *part; part = g_ptr_array_index (task->parts, i); - rspamd_message_process_text_part (task, part); + rspamd_message_process_text_part_maybe (task, part); } rspamd_images_process (task); @@ -1207,7 +1228,7 @@ rspamd_message_process (struct rspamd_task *task) sel = p2; } else { - if (p1->ucs_len > p2->ucs_len) { + if (p1->unicode_content->len > p2->unicode_content->len) { sel = p1; } else { @@ -1215,7 +1236,7 @@ rspamd_message_process (struct rspamd_task *task) } } - rspamd_mime_part_detect_language (task, sel, sel->ucs_len); + rspamd_mime_part_detect_language (task, sel); if (sel->language && sel->language[0]) { /* Propagate language */ @@ -1274,13 +1295,13 @@ rspamd_message_process (struct rspamd_task *task) PTR_ARRAY_FOREACH (task->text_parts, i, text_part) { if (!text_part->language) { - rspamd_mime_part_detect_language (task, text_part, text_part->ucs_len); + rspamd_mime_part_detect_language (task, text_part); } rspamd_mime_part_extract_words (task, text_part); - if (text_part->normalized_words) { - total_words += text_part->normalized_words->len; + if (text_part->utf_words) { + total_words += text_part->utf_words->len; } } diff --git a/src/libmime/message.h b/src/libmime/message.h index baabb762a..e4b5a3d4b 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -86,20 +86,28 @@ struct rspamd_mime_text_part { const gchar *language; GPtrArray *languages; const gchar *real_charset; + + /* Raw data in native encoding */ rspamd_ftok_t raw; rspamd_ftok_t parsed; /* decoded from mime encodings */ - GByteArray *content; /* utf8 encoded processed content */ - GArray *ucs_raw_content; /* unicode raw content (of UChar) */ + /* UTF8 content */ + GByteArray *utf_content; /* utf8 encoded processed content */ GByteArray *utf_raw_content; /* utf raw content */ - GByteArray *stripped_content; /* utf content with no newlines */ + GByteArray *utf_stripped_content; /* utf content with no newlines */ + GArray *normalized_hashes; + GArray *utf_words; + + /* Unicode content, used by libicu */ + GArray *unicode_raw_content; /* unicode raw content (of UChar) */ + GArray *unicode_content; /* unicode processed content (of UChar) */ + GArray *unicode_words; + GPtrArray *newlines; /**< positions of newlines in text, relative to content*/ struct html_content *html; GList *exceptions; /**< list of offsets of urls */ struct rspamd_mime_part *mime_part; - GArray *normalized_words; - GArray *ucs32_words; - GArray *normalized_hashes; + guint flags; guint nlines; guint spaces; @@ -110,7 +118,6 @@ struct rspamd_mime_text_part { guint empty_lines; guint capital_letters; guint numeric_characters; - guint ucs_len; }; enum rspamd_received_type { diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c index d3f255740..a0abb1bb0 100644 --- a/src/libmime/mime_encoding.c +++ b/src/libmime/mime_encoding.c @@ -283,18 +283,18 @@ rspamd_mime_text_part_ucs_from_utf (struct rspamd_task *task, rspamd_mime_utf8_conv_init (); utf = text_part->utf_raw_content; - text_part->ucs_raw_content = g_array_sized_new (FALSE, FALSE, + text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE, sizeof (UChar), utf->len + 1); - text_part->ucs_raw_content->len = ucnv_toUChars (utf8_converter, - (UChar *)text_part->ucs_raw_content->data, + text_part->unicode_raw_content->len = ucnv_toUChars (utf8_converter, + (UChar *)text_part->unicode_raw_content->data, utf->len + 1, utf->data, utf->len, &uc_err); if (!U_SUCCESS (uc_err)) { - g_array_free (text_part->ucs_raw_content, TRUE); - text_part->ucs_raw_content = NULL; + g_array_free (text_part->unicode_raw_content, TRUE); + text_part->unicode_raw_content = NULL; } } @@ -311,12 +311,12 @@ rspamd_mime_text_part_normalise (struct rspamd_task *task, norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err); } - if (!text_part->ucs_raw_content) { + if (!text_part->unicode_raw_content) { return; } - src = (UChar *)text_part->ucs_raw_content->data; - nsym = text_part->ucs_raw_content->len; + src = (UChar *)text_part->unicode_raw_content->data; + nsym = text_part->unicode_raw_content->len; /* We can now check if we need to decompose */ end = unorm2_spanQuickCheckYes (norm, src, nsym, &uc_err); @@ -346,8 +346,8 @@ rspamd_mime_text_part_normalise (struct rspamd_task *task, } else { /* Copy normalised back */ - memcpy (text_part->ucs_raw_content->data, dest, nsym * sizeof (UChar)); - text_part->ucs_raw_content->len = nsym; + memcpy (text_part->unicode_raw_content->data, dest, nsym * sizeof (UChar)); + text_part->unicode_raw_content->len = nsym; text_part->flags |= RSPAMD_MIME_TEXT_PART_NORMALISED; } @@ -369,16 +369,16 @@ rspamd_mime_text_part_maybe_renormalise (struct rspamd_task *task, rspamd_mime_utf8_conv_init (); if ((text_part->flags & RSPAMD_MIME_TEXT_PART_NORMALISED) && - text_part->ucs_raw_content) { + text_part->unicode_raw_content) { clen = ucnv_getMaxCharSize (utf8_converter); - dlen = UCNV_GET_MAX_BYTES_FOR_STRING (text_part->ucs_raw_content->len, + dlen = UCNV_GET_MAX_BYTES_FOR_STRING (text_part->unicode_raw_content->len, clen); g_byte_array_set_size (text_part->utf_raw_content, dlen); r = ucnv_fromUChars (utf8_converter, text_part->utf_raw_content->data, dlen, - (UChar *)text_part->ucs_raw_content->data, - text_part->ucs_raw_content->len, + (UChar *)text_part->unicode_raw_content->data, + text_part->unicode_raw_content->len, &uc_err); text_part->utf_raw_content->len = r; } @@ -410,10 +410,10 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task, } - text_part->ucs_raw_content = g_array_sized_new (FALSE, FALSE, + text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE, sizeof (UChar), input->len + 1); r = ucnv_toUChars (conv, - (UChar *)text_part->ucs_raw_content->data, + (UChar *)text_part->unicode_raw_content->data, input->len + 1, input->data, input->len, @@ -426,7 +426,7 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task, return FALSE; } - text_part->ucs_raw_content->len = r; + text_part->unicode_raw_content->len = r; rspamd_mime_text_part_normalise (task, text_part); /* Now, convert to utf8 */ @@ -434,7 +434,7 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task, dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen); d = rspamd_mempool_alloc (task->task_pool, dlen); r = ucnv_fromUChars (utf8_converter, d, dlen, - (UChar *)text_part->ucs_raw_content->data, r, &uc_err); + (UChar *)text_part->unicode_raw_content->data, r, &uc_err); if (!U_SUCCESS (uc_err)) { g_set_error (err, rspamd_iconv_error_quark (), EINVAL, @@ -750,3 +750,17 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task, SET_PART_UTF (text_part); } + +void +rspamd_utf_to_unicode (GByteArray *in, GArray *dest) +{ + UErrorCode uc_err = U_ZERO_ERROR; + + g_array_set_size (dest, in->len + 1); + dest->len = ucnv_toUChars (utf8_converter, + (UChar *)dest->data, + in->len + 1, + in->data, + in->len, + &uc_err); +} diff --git a/src/libmime/mime_encoding.h b/src/libmime/mime_encoding.h index 5e30efdae..0754bb348 100644 --- a/src/libmime/mime_encoding.h +++ b/src/libmime/mime_encoding.h @@ -86,4 +86,11 @@ gboolean rspamd_mime_charset_utf_check (rspamd_ftok_t *charset, */ void rspamd_mime_charset_utf_enforce (gchar *in, gsize len); +/** + * Converts utf8 to libicu unichars + * @param in + * @param dest + */ +void rspamd_utf_to_unicode (GByteArray *in, GArray *dest); + #endif /* SRC_LIBMIME_MIME_ENCODING_H_ */ diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c index c47db5761..268376e4d 100644 --- a/src/libserver/re_cache.c +++ b/src/libserver/re_cache.c @@ -905,8 +905,8 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, raw = TRUE; } - in = part->content->data; - len = part->content->len; + in = part->utf_content->data; + len = part->utf_content->len; } } @@ -1006,9 +1006,9 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, for (i = 0; i < task->text_parts->len; i++) { part = g_ptr_array_index (task->text_parts, i); - if (part->stripped_content) { - scvec[i + 1] = (guchar *)part->stripped_content->data; - lenvec[i + 1] = part->stripped_content->len; + if (part->utf_stripped_content) { + scvec[i + 1] = (guchar *)part->utf_stripped_content->data; + lenvec[i + 1] = part->utf_stripped_content->len; } else { scvec[i + 1] = (guchar *)""; diff --git a/src/libserver/task.c b/src/libserver/task.c index bfeec990b..07efcd182 100644 --- a/src/libserver/task.c +++ b/src/libserver/task.c @@ -242,20 +242,20 @@ rspamd_task_free (struct rspamd_task *task) for (i = 0; i < task->text_parts->len; i ++) { tp = g_ptr_array_index (task->text_parts, i); - if (tp->normalized_words) { - g_array_free (tp->normalized_words, TRUE); + if (tp->utf_words) { + g_array_free (tp->utf_words, TRUE); } if (tp->normalized_hashes) { g_array_free (tp->normalized_hashes, TRUE); } - if (tp->ucs32_words) { - g_array_free (tp->ucs32_words, TRUE); + if (tp->unicode_words) { + g_array_free (tp->unicode_words, TRUE); } if (tp->languages) { g_ptr_array_unref (tp->languages); } - if (tp->ucs_raw_content) { - g_array_free (tp->ucs_raw_content, TRUE); + if (tp->unicode_raw_content) { + g_array_free (tp->unicode_raw_content, TRUE); } } diff --git a/src/libserver/url.c b/src/libserver/url.c index 653cc3570..9e6ab72db 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -2624,7 +2624,7 @@ rspamd_url_text_extract (rspamd_mempool_t *pool, { struct rspamd_url_mimepart_cbdata mcbd; - if (part->stripped_content == NULL || part->stripped_content->len == 0) { + if (part->utf_stripped_content == NULL || part->utf_stripped_content->len == 0) { msg_warn_task ("got empty text part"); return; } @@ -2632,8 +2632,8 @@ rspamd_url_text_extract (rspamd_mempool_t *pool, mcbd.task = task; mcbd.part = part; - rspamd_url_find_multiple (task->task_pool, part->stripped_content->data, - part->stripped_content->len, is_html, part->newlines, + rspamd_url_find_multiple (task->task_pool, part->utf_stripped_content->data, + part->utf_stripped_content->len, is_html, part->newlines, rspamd_url_text_part_callback, &mcbd); } diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c index 540a9e23f..394173444 100644 --- a/src/libstat/stat_process.c +++ b/src/libstat/stat_process.c @@ -331,8 +331,8 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, for (i = 0; i < task->text_parts->len; i++) { part = g_ptr_array_index (task->text_parts, i); - if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) { - reserved_len += part->normalized_words->len; + if (!IS_PART_EMPTY (part) && part->utf_words != NULL) { + reserved_len += part->utf_words->len; } /* XXX: normal window size */ reserved_len += 5; @@ -346,9 +346,9 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, for (i = 0; i < task->text_parts->len; i ++) { part = g_ptr_array_index (task->text_parts, i); - if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) { + if (!IS_PART_EMPTY (part) && part->utf_words != NULL) { st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool, - part->normalized_words, IS_PART_UTF (part), + part->utf_words, IS_PART_UTF (part), NULL, task->tokens); } diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index fce98c53f..5436430fe 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -59,7 +59,7 @@ const gchar t_delimiters[255] = { /* Get next word from specified f_str_t buf */ static gboolean -rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf, +rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf, gchar const **cur, rspamd_stat_token_t * token, GList **exceptions, gsize *rl, gboolean unused) { @@ -149,7 +149,7 @@ rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf, } static gboolean -rspamd_tokenizer_get_word (rspamd_stat_token_t * buf, +rspamd_tokenizer_get_word_utf8 (rspamd_stat_token_t * buf, gchar const **cur, rspamd_stat_token_t * token, GList **exceptions, gsize *rl, gboolean check_signature) @@ -355,10 +355,10 @@ rspamd_tokenize_text (const gchar *text, gsize len, switch (how) { case RSPAMD_TOKENIZE_RAW: - func = rspamd_tokenizer_get_word_compat; + func = rspamd_tokenizer_get_word_raw; break; case RSPAMD_TOKENIZE_UTF: - func = rspamd_tokenizer_get_word; + func = rspamd_tokenizer_get_word_utf8; break; default: g_assert_not_reached (); diff --git a/src/libstat/tokenizers/tokenizers.h b/src/libstat/tokenizers/tokenizers.h index 8be5f98a8..16ab142fd 100644 --- a/src/libstat/tokenizers/tokenizers.h +++ b/src/libstat/tokenizers/tokenizers.h @@ -28,7 +28,7 @@ struct rspamd_stat_tokenizer { enum rspamd_tokenize_type { RSPAMD_TOKENIZE_UTF = 0, RSPAMD_TOKENIZE_RAW, - RSPAMD_TOKENIZE_UCS + RSPAMD_TOKENIZE_UNICODE }; /* Compare two token nodes */ diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c index bb3406e80..78c3e05b9 100644 --- a/src/lua/lua_mimepart.c +++ b/src/lua/lua_mimepart.c @@ -549,16 +549,16 @@ lua_textpart_get_content (lua_State * L) rspamd_lua_setclass (L, "rspamd{text}", -1); if (!type) { - start = part->content->data; - len = part->content->len; + start = part->utf_content->data; + len = part->utf_content->len; } else if (strcmp (type, "content") == 0) { - start = part->content->data; - len = part->content->len; + start = part->utf_content->data; + len = part->utf_content->len; } else if (strcmp (type, "content_oneline") == 0) { - start = part->stripped_content->data; - len = part->stripped_content->len; + start = part->utf_stripped_content->data; + len = part->utf_stripped_content->len; } else if (strcmp (type, "raw_parsed") == 0) { start = part->parsed.begin; @@ -618,8 +618,8 @@ lua_textpart_get_content_oneline (lua_State * L) t = lua_newuserdata (L, sizeof (*t)); rspamd_lua_setclass (L, "rspamd{text}", -1); - t->start = part->stripped_content->data; - t->len = part->stripped_content->len; + t->start = part->utf_stripped_content->data; + t->len = part->utf_stripped_content->len; t->flags = 0; return 1; @@ -636,11 +636,11 @@ lua_textpart_get_length (lua_State * L) return 1; } - if (IS_PART_EMPTY (part) || part->content == NULL) { + if (IS_PART_EMPTY (part) || part->utf_content == NULL) { lua_pushinteger (L, 0); } else { - lua_pushinteger (L, part->content->len); + lua_pushinteger (L, part->utf_content->len); } return 1; @@ -721,11 +721,11 @@ lua_textpart_get_words_count (lua_State *L) return 1; } - if (IS_PART_EMPTY (part) || part->normalized_words == NULL) { + if (IS_PART_EMPTY (part) || part->utf_words == NULL) { lua_pushinteger (L, 0); } else { - lua_pushinteger (L, part->normalized_words->len); + lua_pushinteger (L, part->utf_words->len); } return 1; @@ -743,14 +743,14 @@ lua_textpart_get_words (lua_State *L) return luaL_error (L, "invalid arguments"); } - if (IS_PART_EMPTY (part) || part->normalized_words == NULL) { + if (IS_PART_EMPTY (part) || part->utf_words == NULL) { lua_createtable (L, 0, 0); } else { - lua_createtable (L, part->normalized_words->len, 0); + lua_createtable (L, part->utf_words->len, 0); - for (i = 0; i < part->normalized_words->len; i ++) { - w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i); + for (i = 0; i < part->utf_words->len; i ++) { + w = &g_array_index (part->utf_words, rspamd_stat_token_t, i); lua_pushlstring (L, w->begin, w->len); lua_rawseti (L, -2, i + 1); @@ -876,8 +876,8 @@ struct lua_shingle_data { }; #define STORE_TOKEN(i, t) do { \ - if ((i) < part->normalized_words->len) { \ - word = &g_array_index (part->normalized_words, rspamd_stat_token_t, (i)); \ + if ((i) < part->utf_words->len) { \ + word = &g_array_index (part->utf_words, rspamd_stat_token_t, (i)); \ sd->t.begin = word->begin; \ sd->t.len = word->len; \ } \ @@ -936,8 +936,8 @@ lua_textpart_get_fuzzy_hashes (lua_State * L) /* Calculate direct hash */ rspamd_cryptobox_hash_init (&st, key, rspamd_cryptobox_HASHKEYBYTES); - for (i = 0; i < part->normalized_words->len; i ++) { - word = &g_array_index (part->normalized_words, rspamd_stat_token_t, i); + for (i = 0; i < part->utf_words->len; i ++) { + word = &g_array_index (part->utf_words, rspamd_stat_token_t, i); rspamd_cryptobox_hash_update (&st, word->begin, word->len); } @@ -947,7 +947,7 @@ lua_textpart_get_fuzzy_hashes (lua_State * L) sizeof (hexdigest)); lua_pushlstring (L, hexdigest, sizeof (hexdigest) - 1); - sgl = rspamd_shingles_from_text (part->normalized_words, key, + sgl = rspamd_shingles_from_text (part->utf_words, key, pool, lua_shingles_filter, part, RSPAMD_SHINGLES_MUMHASH); if (sgl == NULL) { diff --git a/src/lua/lua_trie.c b/src/lua/lua_trie.c index 16a8ace0c..e6a6052d4 100644 --- a/src/lua/lua_trie.c +++ b/src/lua/lua_trie.c @@ -262,9 +262,9 @@ lua_trie_search_mime (lua_State *L) for (i = 0; i < task->text_parts->len; i ++) { part = g_ptr_array_index (task->text_parts, i); - if (!IS_PART_EMPTY (part) && part->content != NULL) { - text = part->content->data; - len = part->content->len; + if (!IS_PART_EMPTY (part) && part->utf_content != NULL) { + text = part->utf_content->data; + len = part->utf_content->len; if (lua_trie_search_str (L, trie, text, len) != 0) { found = TRUE; diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c index 987879258..3c7157311 100644 --- a/src/plugins/chartable.c +++ b/src/plugins/chartable.c @@ -560,13 +560,13 @@ rspamd_chartable_process_part (struct rspamd_task *task, guint i, ncap = 0; gdouble cur_score = 0.0; - if (part == NULL || part->normalized_words == NULL || - part->normalized_words->len == 0) { + if (part == NULL || part->utf_words == NULL || + part->utf_words->len == 0) { return; } - for (i = 0; i < part->normalized_words->len; i++) { - w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i); + for (i = 0; i < part->utf_words->len; i++) { + w = &g_array_index (part->utf_words, rspamd_stat_token_t, i); if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) { @@ -588,7 +588,7 @@ rspamd_chartable_process_part (struct rspamd_task *task, */ part->capital_letters += ncap; - cur_score /= (gdouble)part->normalized_words->len; + cur_score /= (gdouble)part->utf_words->len; if (cur_score > 2.0) { cur_score = 2.0; diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c index c0fd8aa4c..bf08c0e46 100644 --- a/src/plugins/fuzzy_check.c +++ b/src/plugins/fuzzy_check.c @@ -1196,7 +1196,7 @@ fuzzy_io_fin (void *ud) static GArray * fuzzy_preprocess_words (struct rspamd_mime_text_part *part, rspamd_mempool_t *pool) { - return part->normalized_words; + return part->utf_words; } static void @@ -1418,8 +1418,8 @@ fuzzy_cmd_from_text_part (struct rspamd_task *task, rspamd_cryptobox_hash_init (&st, rule->hash_key->str, rule->hash_key->len); - rspamd_cryptobox_hash_update (&st, part->stripped_content->data, - part->stripped_content->len); + rspamd_cryptobox_hash_update (&st, part->utf_stripped_content->data, + part->utf_stripped_content->len); if (task->subject) { /* We also include subject */ @@ -2615,7 +2615,7 @@ fuzzy_generate_commands (struct rspamd_task *task, struct fuzzy_rule *rule, } /* Check length of part */ - fac = rule->ctx->text_multiplier * part->content->len; + fac = rule->ctx->text_multiplier * part->utf_content->len; if ((double)min_bytes > fac) { if (!rule->short_text_direct_hash) { msg_info_task ( @@ -2624,7 +2624,7 @@ fuzzy_generate_commands (struct rspamd_task *task, struct fuzzy_rule *rule, "skip fuzzy check", task->message_id, min_bytes, fac, - part->content->len, + part->utf_content->len, rule->ctx->text_multiplier); continue; } @@ -2635,21 +2635,21 @@ fuzzy_generate_commands (struct rspamd_task *task, struct fuzzy_rule *rule, "use direct hash", task->message_id, min_bytes, fac, - part->content->len, + part->utf_content->len, rule->ctx->text_multiplier); short_text = TRUE; } } - if (part->normalized_words == NULL || - part->normalized_words->len == 0) { + if (part->utf_words == NULL || + part->utf_words->len == 0) { msg_info_task ("<%s>, part hash empty, skip fuzzy check", task->message_id); continue; } if (rule->ctx->min_hash_len != 0 && - part->normalized_words->len < + part->utf_words->len < rule->ctx->min_hash_len) { if (!rule->short_text_direct_hash) { msg_info_task (