source.dussan.org Git - rspamd.git/commitdiff
[Fix] Further fixes in charset detection
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 27 Jan 2020 18:21:12 +0000 (18:21 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 27 Jan 2020 18:21:12 +0000 (18:21 +0000)
src/libmime/mime_encoding.c

index 1f130325e704c4767033a72f1a898b160fa2694e..646b31eae1d3dc8df18a3f96043306f12be9aadb 100644 (file)
@@ -573,17 +573,10 @@ rspamd_mime_charset_find_by_content (const gchar *in, gsize inlen)
                g_assert (csd != NULL);
        }
 
-       /* If text is ascii, then we can treat it as utf8 data */
-       for (i = 0; i < inlen; i++) {
-               if ((((guchar)in[i]) & 0x80) != 0) {
-                       goto detect;
-               }
+       if (rspamd_fast_utf8_validate (in, inlen) == 0) {
+               return UTF8_CHARSET;
        }
 
-       return UTF8_CHARSET;
-
-detect:
-
        ucsdet_setText (csd, in, inlen, &uc_err);
        csm = ucsdet_detectAll (csd, &matches, &uc_err);
 
@@ -661,15 +654,11 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
 {
        GError *err = NULL;
        const gchar *charset = NULL;
-       gboolean checked = FALSE, need_charset_heuristic = TRUE;
+       gboolean checked = FALSE, need_charset_heuristic = TRUE, valid_utf8 = FALSE;
        GByteArray *part_content;
        rspamd_ftok_t charset_tok;
        struct rspamd_mime_part *part = text_part->mime_part;
 
-       if (rspamd_str_has_8bit (text_part->raw.begin, text_part->raw.len)) {
-               text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT;
-       }
-
        /* Allocate copy storage */
        part_content = g_byte_array_sized_new (text_part->parsed.len);
        memcpy (part_content->data, text_part->parsed.begin, text_part->parsed.len);
@@ -680,18 +669,20 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
                        (rspamd_mempool_destruct_t)g_byte_array_unref, part_content);
 
        if (rspamd_str_has_8bit (text_part->parsed.begin, text_part->parsed.len)) {
+               if (rspamd_fast_utf8_validate (text_part->parsed.begin, text_part->parsed.len) == 0) {
+                       /* Valid UTF, likely all good */
+                       need_charset_heuristic = FALSE;
+                       valid_utf8 = TRUE;
+                       checked = TRUE;
+               }
+
                text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED;
        }
-
-       if (!(text_part->flags & RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED)) {
+       else {
+               /* All 7bit characters, assume it valid utf */
                need_charset_heuristic = FALSE;
-       }
-
-       if (task->cfg && task->cfg->raw_mode) {
-               SET_PART_RAW (text_part);
-               text_part->utf_raw_content = part_content;
-
-               return;
+               valid_utf8 = TRUE;
+               checked = TRUE; /* Already valid utf, no need in further checks */
        }
 
        if (part->ct->charset.len == 0) {
@@ -706,7 +697,7 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
                        checked = TRUE;
                        text_part->real_charset = charset;
                }
-               else {
+               else if (valid_utf8) {
                        SET_PART_UTF (text_part);
                        text_part->utf_raw_content = part_content;
                        text_part->real_charset = UTF8_CHARSET;
@@ -719,17 +710,30 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
                                task->task_pool);
 
                if (charset == NULL) {
-                       charset = rspamd_mime_charset_find_by_content (part_content->data,
-                                       MIN (RSPAMD_CHARSET_MAX_CONTENT, part_content->len));
-                       msg_info_task ("detected charset: %s", charset);
-                       checked = TRUE;
-                       text_part->real_charset = charset;
+                       /* We don't know the real charset but can try heuristic */
+                       if (need_charset_heuristic) {
+                               charset = rspamd_mime_charset_find_by_content (part_content->data,
+                                               MIN (RSPAMD_CHARSET_MAX_CONTENT, part_content->len));
+                               msg_info_task ("detected charset: %s", charset);
+                               checked = TRUE;
+                               text_part->real_charset = charset;
+                       }
+                       else if (valid_utf8) {
+                               /* We already know that the input is valid utf, so skip heuristic */
+                               text_part->real_charset = UTF8_CHARSET;
+                       }
+               }
+               else {
+                       /*
+                        * We have detected some charset, but we don't know which one
+                        */
+                       valid_utf8 = FALSE;
                }
        }
 
-       if (charset == NULL) {
-               msg_info_task ("<%s>: has invalid charset",
-                               MESSAGE_FIELD_CHECK (task, message_id));
+       if (text_part->real_charset == NULL) {
+               msg_info_task ("<%s>: has invalid charset; original: %T",
+                               MESSAGE_FIELD_CHECK (task, message_id), &part->ct->charset);
                SET_PART_RAW (text_part);
                text_part->utf_raw_content = part_content;
 
@@ -738,32 +742,37 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
 
        RSPAMD_FTOK_FROM_STR (&charset_tok, charset);
 
-       if (rspamd_mime_charset_utf_check (&charset_tok, part_content->data,
-                       part_content->len, !checked)) {
-               SET_PART_UTF (text_part);
-               text_part->utf_raw_content = part_content;
-               text_part->real_charset = UTF8_CHARSET;
-
-               return;
-       }
-       else {
-               charset = charset_tok.begin;
-
-               if (!rspamd_mime_text_part_utf8_convert (task, text_part,
-                               part_content, charset, &err)) {
-                       msg_warn_task ("<%s>: cannot convert from %s to utf8: %s",
-                                       MESSAGE_FIELD (task, message_id),
-                                       charset,
-                                       err ? err->message : "unknown problem");
-                       SET_PART_RAW (text_part);
-                       g_error_free (err);
-
+       if (!valid_utf8) {
+               if (rspamd_mime_charset_utf_check (&charset_tok, part_content->data,
+                               part_content->len, !checked)) {
+                       SET_PART_UTF (text_part);
                        text_part->utf_raw_content = part_content;
+                       text_part->real_charset = UTF8_CHARSET;
+
                        return;
                }
+               else {
+                       charset = charset_tok.begin;
+
+                       if (!rspamd_mime_text_part_utf8_convert (task, text_part,
+                                       part_content, charset, &err)) {
+                               msg_warn_task ("<%s>: cannot convert from %s to utf8: %s",
+                                               MESSAGE_FIELD (task, message_id),
+                                               charset,
+                                               err ? err->message : "unknown problem");
+                               SET_PART_RAW (text_part);
+                               g_error_free (err);
+
+                               text_part->utf_raw_content = part_content;
+                               return;
+                       }
 
-               text_part->real_charset = charset;
+                       SET_PART_UTF (text_part);
+                       text_part->real_charset = charset;
+               }
+       }
+       else {
+               SET_PART_UTF (text_part);
+               text_part->utf_raw_content = part_content;
        }
-
-       SET_PART_UTF (text_part);
 }