From a455c6542212948969b6c115344e9d059606407a Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 27 Jan 2020 18:21:12 +0000 Subject: [PATCH] [Fix] Further fixes in charset detection --- src/libmime/mime_encoding.c | 119 +++++++++++++++++++----------------- 1 file changed, 64 insertions(+), 55 deletions(-) diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c index 1f130325e..646b31eae 100644 --- a/src/libmime/mime_encoding.c +++ b/src/libmime/mime_encoding.c @@ -573,17 +573,10 @@ rspamd_mime_charset_find_by_content (const gchar *in, gsize inlen) g_assert (csd != NULL); } - /* If text is ascii, then we can treat it as utf8 data */ - for (i = 0; i < inlen; i++) { - if ((((guchar)in[i]) & 0x80) != 0) { - goto detect; - } + if (rspamd_fast_utf8_validate (in, inlen) == 0) { + return UTF8_CHARSET; } - return UTF8_CHARSET; - -detect: - ucsdet_setText (csd, in, inlen, &uc_err); csm = ucsdet_detectAll (csd, &matches, &uc_err); @@ -661,15 +654,11 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task, { GError *err = NULL; const gchar *charset = NULL; - gboolean checked = FALSE, need_charset_heuristic = TRUE; + gboolean checked = FALSE, need_charset_heuristic = TRUE, valid_utf8 = FALSE; GByteArray *part_content; rspamd_ftok_t charset_tok; struct rspamd_mime_part *part = text_part->mime_part; - if (rspamd_str_has_8bit (text_part->raw.begin, text_part->raw.len)) { - text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT; - } - /* Allocate copy storage */ part_content = g_byte_array_sized_new (text_part->parsed.len); memcpy (part_content->data, text_part->parsed.begin, text_part->parsed.len); @@ -680,18 +669,20 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task, (rspamd_mempool_destruct_t)g_byte_array_unref, part_content); if (rspamd_str_has_8bit (text_part->parsed.begin, text_part->parsed.len)) { + if (rspamd_fast_utf8_validate (text_part->parsed.begin, text_part->parsed.len) == 0) { + /* Valid UTF, likely all good */ + need_charset_heuristic = FALSE; + valid_utf8 = TRUE; + checked = TRUE; + } + text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED; } - - if (!(text_part->flags & RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED)) { + else { + /* All 7bit characters, assume it valid utf */ need_charset_heuristic = FALSE; - } - - if (task->cfg && task->cfg->raw_mode) { - SET_PART_RAW (text_part); - text_part->utf_raw_content = part_content; - - return; + valid_utf8 = TRUE; + checked = TRUE; /* Already valid utf, no need in further checks */ } if (part->ct->charset.len == 0) { @@ -706,7 +697,7 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task, checked = TRUE; text_part->real_charset = charset; } - else { + else if (valid_utf8) { SET_PART_UTF (text_part); text_part->utf_raw_content = part_content; text_part->real_charset = UTF8_CHARSET; @@ -719,17 +710,30 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task, task->task_pool); if (charset == NULL) { - charset = rspamd_mime_charset_find_by_content (part_content->data, - MIN (RSPAMD_CHARSET_MAX_CONTENT, part_content->len)); - msg_info_task ("detected charset: %s", charset); - checked = TRUE; - text_part->real_charset = charset; + /* We don't know the real charset but can try heuristic */ + if (need_charset_heuristic) { + charset = rspamd_mime_charset_find_by_content (part_content->data, + MIN (RSPAMD_CHARSET_MAX_CONTENT, part_content->len)); + msg_info_task ("detected charset: %s", charset); + checked = TRUE; + text_part->real_charset = charset; + } + else if (valid_utf8) { + /* We already know that the input is valid utf, so skip heuristic */ + text_part->real_charset = UTF8_CHARSET; + } + } + else { + /* + * We have detected some charset, but we don't know which one + */ + valid_utf8 = FALSE; } } - if (charset == NULL) { - msg_info_task ("<%s>: has invalid charset", - MESSAGE_FIELD_CHECK (task, message_id)); + if (text_part->real_charset == NULL) { + msg_info_task ("<%s>: has invalid charset; original: %T", + MESSAGE_FIELD_CHECK (task, message_id), &part->ct->charset); SET_PART_RAW (text_part); text_part->utf_raw_content = part_content; @@ -738,32 +742,37 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task, RSPAMD_FTOK_FROM_STR (&charset_tok, charset); - if (rspamd_mime_charset_utf_check (&charset_tok, part_content->data, - part_content->len, !checked)) { - SET_PART_UTF (text_part); - text_part->utf_raw_content = part_content; - text_part->real_charset = UTF8_CHARSET; - - return; - } - else { - charset = charset_tok.begin; - - if (!rspamd_mime_text_part_utf8_convert (task, text_part, - part_content, charset, &err)) { - msg_warn_task ("<%s>: cannot convert from %s to utf8: %s", - MESSAGE_FIELD (task, message_id), - charset, - err ? err->message : "unknown problem"); - SET_PART_RAW (text_part); - g_error_free (err); - + if (!valid_utf8) { + if (rspamd_mime_charset_utf_check (&charset_tok, part_content->data, + part_content->len, !checked)) { + SET_PART_UTF (text_part); text_part->utf_raw_content = part_content; + text_part->real_charset = UTF8_CHARSET; + return; } + else { + charset = charset_tok.begin; + + if (!rspamd_mime_text_part_utf8_convert (task, text_part, + part_content, charset, &err)) { + msg_warn_task ("<%s>: cannot convert from %s to utf8: %s", + MESSAGE_FIELD (task, message_id), + charset, + err ? err->message : "unknown problem"); + SET_PART_RAW (text_part); + g_error_free (err); + + text_part->utf_raw_content = part_content; + return; + } - text_part->real_charset = charset; + SET_PART_UTF (text_part); + text_part->real_charset = charset; + } + } + else { + SET_PART_UTF (text_part); + text_part->utf_raw_content = part_content; } - - SET_PART_UTF (text_part); } -- 2.39.5