/*-
 * Copyright 2016 Vsevolod Stakhov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "config.h"
#include "libutil/mem_pool.h"
#include "libutil/regexp.h"
#include "libutil/hash.h"
#include "libserver/task.h"
#include "mime_encoding.h"
#include "message.h"
/*
 * NOTE(review): the names of the four system headers below were missing from
 * the reviewed copy of this file. They are reconstructed from the symbols
 * used here (ucnv_*, ucsdet_*, unorm2_* and fabs) - please confirm.
 */
#include <unicode/ucnv.h>
#include <unicode/ucsdet.h>
#if U_ICU_VERSION_MAJOR_NUM >= 44
#include <unicode/unorm2.h>
#endif
#include <math.h>

#define UTF8_CHARSET "UTF-8"

#define RSPAMD_CHARSET_FLAG_UTF (1 << 0)
#define RSPAMD_CHARSET_FLAG_ASCII (1 << 1)
#define RSPAMD_CHARSET_CACHE_SIZE 32
#define RSPAMD_CHARSET_MAX_CONTENT 128

#define SET_PART_RAW(part) ((part)->flags &= ~RSPAMD_MIME_TEXT_PART_FLAG_UTF)
#define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_TEXT_PART_FLAG_UTF)

/* Lazily compiled regexp matching charset names treated as UTF-8 compatible */
static rspamd_regexp_t *utf_compatible_re = NULL;
/* Shared UTF-8 converter, opened on demand by rspamd_mime_utf8_conv_init() */
UConverter *utf8_converter = NULL;
#if U_ICU_VERSION_MAJOR_NUM >= 44
/* Lazily obtained NFKC normaliser instance */
static const UNormalizer2 *norm = NULL;
#endif

struct rspamd_charset_substitution {
	const gchar *input;
	const gchar *canon;
	gint flags;
};

#include "mime_encoding_list.h"

/* Case-insensitive map: charset alias -> struct rspamd_charset_substitution */
static GHashTable *sub_hash = NULL;

/**
 * Returns the GQuark used as the error domain for all conversion errors
 * reported by this file.
 */
static GQuark
rspamd_iconv_error_quark (void)
{
	return g_quark_from_static_string ("iconv error");
}

/**
 * Returns an ICU converter for the given encoding name, reusing converters
 * through a process-wide LRU cache keyed by the IANA standard name.
 *
 * @param enc encoding name or alias
 * @param err ICU error code, passed to the ICU calls made here
 * @return a converter owned by the cache (callers must not close it), or NULL
 * if the encoding has no IANA standard name or the converter cannot be opened
 */
static UConverter *
rspamd_mime_get_converter_cached (const gchar *enc, UErrorCode *err)
{
	static rspamd_lru_hash_t *cache;
	const gchar *canon_name;
	UConverter *conv;

	if (cache == NULL) {
		/* Keys are g_strdup'ed names, values are converters closed on evict */
		cache = rspamd_lru_hash_new_full (RSPAMD_CHARSET_CACHE_SIZE, g_free,
				(GDestroyNotify)ucnv_close, rspamd_str_hash,
				rspamd_str_equal);
	}

	canon_name = ucnv_getStandardName (enc, "IANA", err);

	if (canon_name == NULL) {
		return NULL;
	}

	conv = rspamd_lru_hash_lookup (cache, (gpointer)canon_name, 0);

	if (conv == NULL) {
		conv = ucnv_open (canon_name, err);

		if (conv != NULL) {
			/* Undecodable input is substituted instead of failing */
			ucnv_setToUCallBack (conv, UCNV_TO_U_CALLBACK_SUBSTITUTE,
					NULL, NULL, NULL, err);
			rspamd_lru_hash_insert (cache, g_strdup (canon_name), conv, 0, 0);
		}
	}

	return conv;
}

/**
 * Opens the shared `utf8_converter` on first use and configures it to
 * substitute unconvertible data in both directions. Aborts (after logging)
 * if the converter cannot be opened.
 */
static inline void
rspamd_mime_utf8_conv_init (void)
{
	UErrorCode uc_err = U_ZERO_ERROR;

	if (utf8_converter != NULL) {
		return;
	}

	utf8_converter = ucnv_open (UTF8_CHARSET, &uc_err);

	if (!U_SUCCESS (uc_err)) {
		msg_err ("FATAL error: cannot open converter for utf8: %s",
				u_errorName (uc_err));

		g_assert_not_reached ();
	}

	ucnv_setFromUCallBack (utf8_converter, UCNV_FROM_U_CALLBACK_SUBSTITUTE,
			NULL, NULL, NULL, &uc_err);
	ucnv_setToUCallBack (utf8_converter, UCNV_TO_U_CALLBACK_SUBSTITUTE,
			NULL, NULL, NULL, &uc_err);
}

/**
 * Builds `sub_hash` from the `sub` table (provided by the
 * "mime_encoding_list.h" include above), keyed by each entry's `input` name.
 */
static void
rspamd_mime_encoding_substitute_init (void)
{
	guint i;

	sub_hash = g_hash_table_new (rspamd_strcase_hash, rspamd_strcase_equal);

	for (i = 0; i < G_N_ELEMENTS (sub); i ++) {
		g_hash_table_insert (sub_hash, (void *)sub[i].input, (void *)&sub[i]);
	}
}

/**
 * Strips, in place, every leading and trailing character of a charset name
 * that is not ASCII alphanumeric. A string with no alphanumeric characters
 * becomes empty.
 *
 * @param in NUL-terminated, writable charset name
 */
static void
rspamd_charset_normalize (gchar *in)
{
	gchar *begin, *end;
	gsize keep;
	gboolean changed = FALSE;

	begin = in;

	while (*begin && !g_ascii_isalnum (*begin)) {
		begin ++;
		changed = TRUE;
	}

	if (*begin == '\0') {
		/*
		 * Nothing alphanumeric is left; handling it here also avoids
		 * forming a pointer before the start of an empty string below.
		 */
		if (changed) {
			*in = '\0';
		}

		return;
	}

	end = begin + strlen (begin) - 1;

	while (end > begin && !g_ascii_isalnum (*end)) {
		end --;
		changed = TRUE;
	}

	if (changed) {
		/*
		 * Terminate relative to the destination of the move: writing the
		 * NUL at `end + 1` (the pre-move position) used to leave one
		 * trailing junk character when leading junk was stripped too.
		 */
		keep = end - begin + 1;
		memmove (in, begin, keep);
		in[keep] = '\0';
	}
}

/**
 * Maps a charset name taken from a message to its IANA standard name.
 *
 * The name is copied into `pool`, trimmed by rspamd_charset_normalize(),
 * stripped of '-' characters when the input starts with "cp-" or "ibm-"
 * (case-insensitively), then looked up in the substitution table.
 *
 * @param in charset token as found in the message
 * @param pool memory pool used for the working copy
 * @return the value of ucnv_getStandardName() for the substituted (or
 * cleaned) name; may be NULL
 */
const gchar *
rspamd_mime_detect_charset (const rspamd_ftok_t *in, rspamd_mempool_t *pool)
{
	gchar *ret = NULL, *h, *t;
	struct rspamd_charset_substitution *s;
	UErrorCode uc_err = U_ZERO_ERROR;

	if (sub_hash == NULL) {
		rspamd_mime_encoding_substitute_init ();
	}

	ret = rspamd_mempool_ftokdup (pool, in);
	rspamd_charset_normalize (ret);

	if ((in->len > 3 && rspamd_lc_cmp (in->begin, "cp-", 3) == 0) ||
			(in->len > 4 && (rspamd_lc_cmp (in->begin, "ibm-", 4) == 0))) {
		/* Try to remove '-' chars from encoding: e.g. CP-100 to CP100 */
		h = ret;
		t = ret;

		while (*h != '\0') {
			if (*h != '-') {
				*t++ = *h;
			}

			h ++;
		}

		*t = '\0';
	}

	s = g_hash_table_lookup (sub_hash, ret);

	if (s) {
		return ucnv_getStandardName (s->canon, "IANA", &uc_err);
	}

	return ucnv_getStandardName (ret, "IANA", &uc_err);
}

/**
 * Converts a buffer from `in_enc` to UTF-8 via an intermediate UChar buffer.
 *
 * @param pool pool from which the output buffer is allocated
 * @param input data to convert
 * @param len length of `input` in bytes
 * @param in_enc source encoding name
 * @param olen if not NULL, receives the output length in bytes
 * @param err receives a GError (domain rspamd_iconv_error_quark(), code
 * EINVAL) on failure
 * @return pool-allocated UTF-8 data, or NULL on failure
 */
gchar *
rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
		gchar *input, gsize len, const gchar *in_enc,
		gsize *olen, GError **err)
{
	gchar *d;
	gint32 r, clen, dlen;
	UChar *tmp_buf;
	UErrorCode uc_err = U_ZERO_ERROR;
	UConverter *conv;

	rspamd_mime_utf8_conv_init ();
	conv = rspamd_mime_get_converter_cached (in_enc, &uc_err);

	if (conv == NULL) {
		g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
				"cannot open converter for %s: %s",
				in_enc, u_errorName (uc_err));

		return NULL;
	}

	tmp_buf = g_new (UChar, len + 1);
	uc_err = U_ZERO_ERROR;
	r = ucnv_toUChars (conv, tmp_buf, len + 1, input, len, &uc_err);

	if (!U_SUCCESS (uc_err)) {
		g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
				"cannot convert data to unicode from %s: %s",
				in_enc, u_errorName (uc_err));
		g_free (tmp_buf);

		return NULL;
	}

	/* Now, convert to utf8 */
	clen = ucnv_getMaxCharSize (utf8_converter);
	dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
	d = rspamd_mempool_alloc (pool, dlen);
	r = ucnv_fromUChars (utf8_converter, d, dlen, tmp_buf, r, &uc_err);

	if (!U_SUCCESS (uc_err)) {
		g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
				"cannot convert data from unicode from %s: %s",
				in_enc, u_errorName (uc_err));
		g_free (tmp_buf);

		return NULL;
	}

	msg_info_pool ("converted from %s to UTF-8 inlen: %z, outlen: %d",
			in_enc, len, r);
	g_free (tmp_buf);

	if (olen) {
		*olen = r;
	}

	return d;
}

/**
 * Fills `text_part->unicode_raw_content` with the UChar form of
 * `text_part->utf_raw_content`. On conversion failure the array is freed and
 * the field is set to NULL.
 */
static void
rspamd_mime_text_part_ucs_from_utf (struct rspamd_task *task,
		struct rspamd_mime_text_part *text_part)
{
	GByteArray *utf;
	UErrorCode uc_err = U_ZERO_ERROR;

	rspamd_mime_utf8_conv_init ();
	utf = text_part->utf_raw_content;
	text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE,
			sizeof (UChar), utf->len + 1);
	/* The array was pre-sized above, so its length is set directly */
	text_part->unicode_raw_content->len = ucnv_toUChars (utf8_converter,
			(UChar *)text_part->unicode_raw_content->data,
			utf->len + 1,
			utf->data,
			utf->len,
			&uc_err);

	if (!U_SUCCESS (uc_err)) {
		g_array_free (text_part->unicode_raw_content, TRUE);
		text_part->unicode_raw_content = NULL;
	}
}

/**
 * Normalises `text_part->unicode_raw_content` to NFKC in place (only when
 * built against ICU >= 44; otherwise a no-op).
 *
 * Sets RSPAMD_MIME_TEXT_PART_HAS_SUBNORMAL when the content is not already
 * normalised, and RSPAMD_MIME_TEXT_PART_NORMALISED when the normalised text
 * has been written back. The scratch buffer has the same number of UChars as
 * the input; a U_BUFFER_OVERFLOW_ERROR from ICU leaves the content untouched
 * without logging.
 */
static void
rspamd_mime_text_part_normalise (struct rspamd_task *task,
		struct rspamd_mime_text_part *text_part)
{
#if U_ICU_VERSION_MAJOR_NUM >= 44
	UErrorCode uc_err = U_ZERO_ERROR;
	gint32 nsym, end;
	UChar *src = NULL, *dest = NULL;

	if (norm == NULL) {
		norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err);
	}

	if (!text_part->unicode_raw_content) {
		return;
	}

	src = (UChar *)text_part->unicode_raw_content->data;
	nsym = text_part->unicode_raw_content->len;

	/* We can now check if we need to decompose */
	end = unorm2_spanQuickCheckYes (norm, src, nsym, &uc_err);

	if (!U_SUCCESS (uc_err)) {
		msg_warn_task ("cannot normalise URL, cannot check normalisation: %s",
				u_errorName (uc_err));
		return;
	}

	if (end == nsym) {
		/* Already normalised */
		return;
	}

	text_part->flags |= RSPAMD_MIME_TEXT_PART_HAS_SUBNORMAL;

	/* Keep the already-normalised prefix and normalise only the remainder */
	dest = g_malloc (nsym * sizeof (*dest));
	memcpy (dest, src, end * sizeof (*dest));
	nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym,
			src + end, nsym - end, &uc_err);

	if (!U_SUCCESS (uc_err)) {
		if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
			msg_warn_task ("cannot normalise URL: %s",
					u_errorName (uc_err));
		}
	}
	else {
		/* Copy normalised back */
		memcpy (text_part->unicode_raw_content->data, dest,
				nsym * sizeof (UChar));
		text_part->unicode_raw_content->len = nsym;
		text_part->flags |= RSPAMD_MIME_TEXT_PART_NORMALISED;
	}

	g_free (dest);
#endif
}

/*
 * Recode utf from normalised unichars if needed
 */
static void
rspamd_mime_text_part_maybe_renormalise (struct rspamd_task *task,
		struct rspamd_mime_text_part *text_part)
{
	UErrorCode uc_err = U_ZERO_ERROR;
	guint clen, dlen;
	gint r;

	rspamd_mime_utf8_conv_init ();

	if ((text_part->flags & RSPAMD_MIME_TEXT_PART_NORMALISED) &&
			text_part->unicode_raw_content) {
		clen = ucnv_getMaxCharSize (utf8_converter);
		dlen = UCNV_GET_MAX_BYTES_FOR_STRING (
				text_part->unicode_raw_content->len, clen);
		/* Grow to the worst case first, then shrink to the real length */
		g_byte_array_set_size (text_part->utf_raw_content, dlen);
		r = ucnv_fromUChars (utf8_converter,
				text_part->utf_raw_content->data,
				dlen,
				(UChar *)text_part->unicode_raw_content->data,
				text_part->unicode_raw_content->len,
				&uc_err);
		text_part->utf_raw_content->len = r;
	}
}

/**
 * Converts the content of a text part from `charset` to UTF-8.
 *
 * Fills `text_part->unicode_raw_content` with the decoded (and, when
 * possible, NFKC-normalised) UChars and `text_part->utf_raw_content` with the
 * UTF-8 bytes allocated from the task pool.
 *
 * @param task task owning the part
 * @param text_part part to fill
 * @param input raw part content
 * @param charset source charset name
 * @param err receives a GError (domain rspamd_iconv_error_quark(), code
 * EINVAL) on failure
 * @return TRUE on success, FALSE on failure (and `err` is set)
 */
static gboolean
rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
		struct rspamd_mime_text_part *text_part,
		GByteArray *input,
		const gchar *charset,
		GError **err)
{
	gchar *d;
	gint32 r, clen, dlen;
	UErrorCode uc_err = U_ZERO_ERROR;
	UConverter *conv;

	rspamd_mime_utf8_conv_init ();
	conv = rspamd_mime_get_converter_cached (charset, &uc_err);

	if (conv == NULL) {
		g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
				"cannot open converter for %s: %s",
				charset, u_errorName (uc_err));

		return FALSE;
	}

	text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE,
			sizeof (UChar), input->len + 1);
	r = ucnv_toUChars (conv,
			(UChar *)text_part->unicode_raw_content->data,
			input->len + 1,
			input->data,
			input->len,
			&uc_err);

	if (!U_SUCCESS (uc_err)) {
		g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
				"cannot convert data to unicode from %s: %s",
				charset, u_errorName (uc_err));

		return FALSE;
	}

	text_part->unicode_raw_content->len = r;
	rspamd_mime_text_part_normalise (task, text_part);
	/*
	 * Normalisation may have changed the number of UChars; re-read it so
	 * that stale data past the normalised text is never converted.
	 */
	r = (gint32)text_part->unicode_raw_content->len;

	/* Now, convert to utf8 */
	clen = ucnv_getMaxCharSize (utf8_converter);
	dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
	d = rspamd_mempool_alloc (task->task_pool, dlen);
	r = ucnv_fromUChars (utf8_converter, d, dlen,
			(UChar *)text_part->unicode_raw_content->data, r, &uc_err);

	if (!U_SUCCESS (uc_err)) {
		g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
				"cannot convert data from unicode from %s: %s",
				charset, u_errorName (uc_err));

		return FALSE;
	}

	msg_info_task ("converted from %s to UTF-8 inlen: %d, outlen: %d",
			charset, input->len, r);
	/*
	 * The byte array header is carved from the task pool and pointed at the
	 * pool-allocated data. The extra pointer-sized words are presumably
	 * headroom for GLib's private array fields - TODO confirm.
	 */
	text_part->utf_raw_content = rspamd_mempool_alloc (task->task_pool,
			sizeof (*text_part->utf_raw_content) + sizeof (gpointer) * 4);
	text_part->utf_raw_content->data = d;
	text_part->utf_raw_content->len = r;

	return TRUE;
}

/**
 * Converts `in` from encoding `enc` into UTF-8 stored in `out`.
 *
 * If rspamd_mime_charset_utf_check() accepts `enc`, the data is copied to
 * `out` unchanged (that check may itself overwrite invalid bytes of `in`).
 *
 * @return TRUE on success, FALSE if no converter is available or a
 * conversion step fails
 */
gboolean
rspamd_mime_to_utf8_byte_array (GByteArray *in,
		GByteArray *out,
		const gchar *enc)
{
	gint32 r, clen, dlen;
	UChar *tmp_buf;
	UErrorCode uc_err = U_ZERO_ERROR;
	UConverter *conv;
	rspamd_ftok_t charset_tok;

	RSPAMD_FTOK_FROM_STR (&charset_tok, enc);

	if (rspamd_mime_charset_utf_check (&charset_tok, (gchar *)in->data, in->len,
			FALSE)) {
		g_byte_array_set_size (out, in->len);
		memcpy (out->data, in->data, out->len);

		return TRUE;
	}

	rspamd_mime_utf8_conv_init ();
	conv = rspamd_mime_get_converter_cached (enc, &uc_err);

	if (conv == NULL) {
		return FALSE;
	}

	tmp_buf = g_new (UChar, in->len + 1);
	uc_err = U_ZERO_ERROR;
	r = ucnv_toUChars (conv, tmp_buf, in->len + 1, in->data, in->len, &uc_err);

	if (!U_SUCCESS (uc_err)) {
		g_free (tmp_buf);

		return FALSE;
	}

	/* Now, convert to utf8 */
	clen = ucnv_getMaxCharSize (utf8_converter);
	dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
	g_byte_array_set_size (out, dlen);
	r = ucnv_fromUChars (utf8_converter, out->data, dlen, tmp_buf, r, &uc_err);

	if (!U_SUCCESS (uc_err)) {
		g_free (tmp_buf);

		return FALSE;
	}

	g_free (tmp_buf);
	out->len = r;

	return TRUE;
}

/**
 * Forces a buffer to be valid UTF-8 by overwriting every invalid byte
 * sequence with '?' characters, in place. The length is never changed.
 *
 * @param in buffer to fix
 * @param len length of `in` in bytes
 */
void
rspamd_mime_charset_utf_enforce (gchar *in, gsize len)
{
	const gchar *end, *p;
	gsize remain = len;

	/* Now we validate input and replace bad characters with '?' symbol */
	p = in;

	while (remain > 0 && !g_utf8_validate (p, remain, &end)) {
		gchar *valid;

		if (end >= in + len) {
			/* Validation stopped at the very end: blank out the rest */
			if (p < in + len) {
				memset ((gchar *)p, '?', (in + len) - p);
			}

			break;
		}

		valid = g_utf8_find_next_char (end, in + len);

		if (!valid) {
			valid = in + len;
		}

		if (valid > end) {
			memset ((gchar *)end, '?', valid - end);
			p = valid;
			remain = (in + len) - p;
		}
		else {
			break;
		}
	}
}

/**
 * Guesses the charset of a buffer from its content.
 *
 * Pure 7-bit input is reported as UTF-8 without running the detector.
 * Otherwise the ICU charset detector is used and the most confident match is
 * returned if its confidence is above 50 or stands out from the running mean
 * by more than 1.25 times the running deviation.
 *
 * @param in data to inspect
 * @param inlen number of bytes to inspect
 * @return a charset name, or NULL if nothing convincing was found or the
 * detector failed
 */
static const char *
rspamd_mime_charset_find_by_content (gchar *in, gsize inlen)
{
	static UCharsetDetector *csd;
	const UCharsetMatch **csm, *sel = NULL;
	UErrorCode uc_err = U_ZERO_ERROR;
	gint32 matches = 0, i, max_conf = G_MININT32, conf;
	gdouble mean = 0.0, stddev = 0.0;
	gboolean has_8bit = FALSE;
	gsize pos;

	if (csd == NULL) {
		csd = ucsdet_open (&uc_err);

		g_assert (csd != NULL);
	}

	/* If text is ascii, then we can treat it as utf8 data */
	for (pos = 0; pos < inlen; pos ++) {
		if ((((guchar)in[pos]) & 0x80) != 0) {
			has_8bit = TRUE;
			break;
		}
	}

	if (!has_8bit) {
		return UTF8_CHARSET;
	}

	ucsdet_setText (csd, in, inlen, &uc_err);
	csm = ucsdet_detectAll (csd, &matches, &uc_err);

	if (!U_SUCCESS (uc_err) || csm == NULL) {
		/* Detector failure: neither `csm` nor `matches` can be trusted */
		return NULL;
	}

	for (i = 0; i < matches; i ++) {
		if ((conf = ucsdet_getConfidence (csm[i], &uc_err)) > max_conf) {
			max_conf = conf;
			sel = csm[i];
		}

		/* Running mean and running mean of absolute deviations */
		mean += (conf - mean) / (i + 1);
		gdouble err = fabs (conf - mean);
		stddev += (err - stddev) / (i + 1);
	}

	if (sel && ((max_conf > 50) || (max_conf - mean > stddev * 1.25))) {
		return ucsdet_getName (sel, &uc_err);
	}

	return NULL;
}

/**
 * Checks whether a declared charset can be treated as UTF-8.
 *
 * An empty charset, or one matching the UTF/ASCII-compatible regexp, is
 * accepted. When `content_check` is set, the first
 * RSPAMD_CHARSET_MAX_CONTENT bytes are additionally inspected and, if they
 * suggest another charset, `charset` is rewritten to the detected name.
 *
 * @param charset declared charset; may be rewritten (see above)
 * @param in content; invalid UTF-8 is replaced by '?' when TRUE is returned
 * through the enforcement path
 * @param len length of `in`
 * @param content_check whether to run content-based detection
 * @return TRUE if the content is to be treated as UTF-8, FALSE otherwise
 */
gboolean
rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
		gchar *in, gsize len, gboolean content_check)
{
	const gchar *real_charset;

	if (utf_compatible_re == NULL) {
		utf_compatible_re = rspamd_regexp_new (
				"^(?:utf-?8.*)|(?:us-ascii)|(?:ascii)|(?:ansi.*)|(?:CSASCII)$",
				"i", NULL);
	}

	if (charset->len == 0 ||
			rspamd_regexp_match (utf_compatible_re,
					charset->begin, charset->len, TRUE)) {
		/*
		 * In case of UTF8 charset we still can check the content to find
		 * corner cases
		 */
		if (content_check) {
			real_charset = rspamd_mime_charset_find_by_content (in,
					MIN (RSPAMD_CHARSET_MAX_CONTENT, len));

			if (real_charset) {
				if (rspamd_regexp_match (utf_compatible_re,
						real_charset, strlen (real_charset), TRUE)) {
					RSPAMD_FTOK_ASSIGN (charset, UTF8_CHARSET);

					return TRUE;
				}
				else {
					charset->begin = real_charset;
					charset->len = strlen (real_charset);

					return FALSE;
				}
			}
		}

		rspamd_mime_charset_utf_enforce (in, len);

		return TRUE;
	}

	return FALSE;
}

/*
 * Marks the part as UTF-8, uses `content` as its UTF-8 data and derives the
 * (normalised) unicode representation from it.
 */
static void
rspamd_mime_text_part_process_utf (struct rspamd_task *task,
		struct rspamd_mime_text_part *text_part,
		GByteArray *content)
{
	SET_PART_UTF (text_part);
	text_part->utf_raw_content = content;
	rspamd_mime_text_part_ucs_from_utf (task, text_part);
	rspamd_mime_text_part_normalise (task, text_part);
	rspamd_mime_text_part_maybe_renormalise (task, text_part);
}

/**
 * Determines the charset of a text part and, if needed, converts its parsed
 * content to UTF-8.
 *
 * A copy of the parsed content is made (released together with the task
 * pool). Depending on configuration and the declared/detected charset, the
 * part ends up either flagged as UTF with `utf_raw_content` holding UTF-8
 * data, or flagged as raw with `utf_raw_content` holding the unconverted
 * copy.
 *
 * @param task task owning the part
 * @param text_part text part to process
 */
void
rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
		struct rspamd_mime_text_part *text_part)
{
	GError *err = NULL;
	const gchar *charset = NULL;
	gboolean checked = FALSE, need_charset_heuristic = TRUE;
	GByteArray *part_content;
	rspamd_ftok_t charset_tok;
	struct rspamd_mime_part *part = text_part->mime_part;

	if (rspamd_str_has_8bit (text_part->raw.begin, text_part->raw.len)) {
		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT;
	}

	/* Allocate copy storage */
	part_content = g_byte_array_sized_new (text_part->parsed.len);
	memcpy (part_content->data, text_part->parsed.begin, text_part->parsed.len);
	part_content->len = text_part->parsed.len;
	rspamd_mempool_add_destructor (task->task_pool,
			(rspamd_mempool_destruct_t)g_byte_array_unref, part_content);

	if (rspamd_str_has_8bit (text_part->parsed.begin, text_part->parsed.len)) {
		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED;
	}

	if (!(text_part->flags & RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED)) {
		/* No 8-bit data after decoding: nothing to guess from */
		need_charset_heuristic = FALSE;
	}

	if (task->cfg && task->cfg->raw_mode) {
		SET_PART_RAW (text_part);
		text_part->utf_raw_content = part_content;

		return;
	}

	if (part->ct->charset.len == 0) {
		if (need_charset_heuristic) {
			charset = rspamd_mime_charset_find_by_content (part_content->data,
					MIN (RSPAMD_CHARSET_MAX_CONTENT, part_content->len));

			if (charset != NULL) {
				msg_info_task ("detected charset %s", charset);
			}

			checked = TRUE;
		}
		else {
			rspamd_mime_text_part_process_utf (task, text_part, part_content);

			return;
		}
	}
	else {
		charset = rspamd_mime_detect_charset (&part->ct->charset,
				task->task_pool);

		if (charset == NULL) {
			/* Declared charset is unknown: fall back to content detection */
			charset = rspamd_mime_charset_find_by_content (part_content->data,
					MIN (RSPAMD_CHARSET_MAX_CONTENT, part_content->len));

			if (charset != NULL) {
				msg_info_task ("detected charset: %s", charset);
			}

			checked = TRUE;
		}
	}

	if (charset == NULL) {
		msg_info_task ("<%s>: has invalid charset", task->message_id);
		SET_PART_RAW (text_part);
		text_part->utf_raw_content = part_content;

		return;
	}

	RSPAMD_FTOK_FROM_STR (&charset_tok, charset);

	if (rspamd_mime_charset_utf_check (&charset_tok, part_content->data,
			part_content->len, !checked)) {
		rspamd_mime_text_part_process_utf (task, text_part, part_content);

		return;
	}
	else {
		/* The check may have replaced the charset with a detected one */
		charset = charset_tok.begin;

		if (!rspamd_mime_text_part_utf8_convert (task, text_part,
				part_content, charset, &err)) {
			msg_warn_task ("<%s>: cannot convert from %s to utf8: %s",
					task->message_id,
					charset,
					err ? err->message : "unknown problem");
			SET_PART_RAW (text_part);

			if (err != NULL) {
				g_error_free (err);
			}

			text_part->utf_raw_content = part_content;

			return;
		}
	}

	SET_PART_UTF (text_part);
}

/**
 * Converts UTF-8 bytes in `in` to UChars stored in `dest`.
 *
 * `dest` is resized to `in->len + 1` elements and its length is then set to
 * the value returned by ucnv_toUChars(). Conversion errors are not reported.
 *
 * @param in UTF-8 input
 * @param dest array of UChar receiving the result
 */
void
rspamd_utf_to_unicode (GByteArray *in, GArray *dest)
{
	UErrorCode uc_err = U_ZERO_ERROR;

	/* The shared converter is opened lazily; make sure it exists */
	rspamd_mime_utf8_conv_init ();

	g_array_set_size (dest, in->len + 1);
	dest->len = ucnv_toUChars (utf8_converter,
			(UChar *)dest->data,
			in->len + 1,
			in->data,
			in->len,
			&uc_err);
}