/*- * Copyright 2016 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "config.h" #include "libutil/mem_pool.h" #include "libutil/regexp.h" #include "libserver/task.h" #include "message.h" #include #define UTF8_CHARSET "UTF-8" #define RSPAMD_CHARSET_FLAG_UTF (1 << 0) #define RSPAMD_CHARSET_FLAG_ASCII (1 << 1) #define SET_PART_RAW(part) ((part)->flags &= ~RSPAMD_MIME_TEXT_PART_FLAG_UTF) #define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_TEXT_PART_FLAG_UTF) static rspamd_regexp_t *utf_compatible_re = NULL; struct rspamd_charset_substitution { const gchar *input; const gchar *canon; gint flags; }; #include "mime_encoding_list.h" static GHashTable *sub_hash = NULL; static GQuark rspamd_iconv_error_quark (void) { return g_quark_from_static_string ("iconv error"); } static void rspamd_mime_encoding_substitute_init (void) { guint i; sub_hash = g_hash_table_new (rspamd_strcase_hash, rspamd_strcase_equal); for (i = 0; i < G_N_ELEMENTS (sub); i ++) { g_hash_table_insert (sub_hash, (void *)sub[i].input, (void *)&sub[i]); } } static void rspamd_charset_normalize (gchar *in) { /* * This is a simple routine to validate input charset * we just check that charset starts with alphanumeric and ends * with alphanumeric */ gchar *begin, *end; gboolean changed = FALSE; begin = in; while (*begin && !g_ascii_isalnum (*begin)) { begin ++; changed = TRUE; } end = begin + strlen (begin) - 1; while (end > begin && !g_ascii_isalnum (*end)) { end --; changed = TRUE; } if (changed) { memmove (in, begin, end - begin + 2); *(end + 1) = '\0'; } } const gchar * rspamd_mime_detect_charset (const rspamd_ftok_t *in, rspamd_mempool_t *pool) { gchar *ret = NULL, *h, *t; struct rspamd_charset_substitution *s; if (sub_hash == NULL) { rspamd_mime_encoding_substitute_init (); } ret = rspamd_mempool_ftokdup (pool, in); rspamd_charset_normalize (ret); if ((in->len > 3 && rspamd_lc_cmp (in->begin, "cp-", 3) == 0) || (in->len > 4 && (rspamd_lc_cmp (in->begin, "ibm-", 4) == 0 || rspamd_lc_cmp (in->begin, "iso-", 4) == 0) )) { /* Try to remove '-' chars from encoding: e.g. CP-100 to CP100 */ h = ret; t = ret; while (*h != '\0') { if (*h != '-') { *t++ = *h; } h ++; } *t = '\0'; } s = g_hash_table_lookup (sub_hash, ret); if (s) { return s->canon; } return ret; } gchar * rspamd_text_to_utf8 (rspamd_mempool_t *pool, gchar *input, gsize len, const gchar *in_enc, gsize *olen, GError **err) { gchar *s, *d; gsize outlen; iconv_t ic; rspamd_fstring_t *dst; gsize remain, ret, inremain = len; ic = iconv_open (UTF8_CHARSET, in_enc); if (ic == (iconv_t)-1) { g_set_error (err, rspamd_iconv_error_quark (), EINVAL, "cannot open iconv for: %s", in_enc); return NULL; } /* Preallocate for half of characters to be converted */ outlen = len + len / 2 + 1; dst = rspamd_fstring_sized_new (outlen); s = input; d = dst->str; remain = outlen - 1; while (inremain > 0 && remain > 0) { ret = iconv (ic, &s, &inremain, &d, &remain); dst->len = d - dst->str; if (ret == (gsize)-1) { switch (errno) { case E2BIG: /* Enlarge string */ if (inremain > 0) { dst = rspamd_fstring_grow (dst, inremain * 2); d = dst->str + dst->len; remain = dst->allocated - dst->len - 1; } break; case EILSEQ: case EINVAL: /* Ignore bad characters */ if (remain > 0 && inremain > 0) { *d++ = '?'; s++; inremain --; remain --; } break; } } else if (ret == 0) { break; } } *d = '\0'; *olen = dst->len; iconv_close (ic); rspamd_mempool_add_destructor (pool, (rspamd_mempool_destruct_t)rspamd_fstring_free, dst); msg_info_pool ("converted from %s to UTF-8 inlen: %z, outlen: %z", in_enc, len, dst->len); return dst->str; } GByteArray * rspamd_mime_text_part_maybe_convert (struct rspamd_task *task, struct rspamd_mime_text_part *text_part) { GError *err = NULL; gsize write_bytes; const gchar *charset; gchar *res_str; GByteArray *result_array, *part_content; struct rspamd_mime_part *part = text_part->mime_part; part_content = rspamd_mempool_alloc0 (task->task_pool, sizeof (GByteArray)); part_content->data = (guint8 *)text_part->parsed.begin; part_content->len = text_part->parsed.len; if (task->cfg && task->cfg->raw_mode) { SET_PART_RAW (text_part); return part_content; } if (utf_compatible_re == NULL) { utf_compatible_re = rspamd_regexp_new ( "^(?:utf-?8.*)|(?:us-ascii)|(?:ascii)|(?:ansi)|(?:us)|(?:ISO-8859-1)|" "(?:latin.*)|(?:CSASCII)$", "i", NULL); } if (part->ct->charset.len == 0) { SET_PART_RAW (text_part); return part_content; } charset = rspamd_mime_detect_charset (&part->ct->charset, task->task_pool); if (charset == NULL) { msg_info_task ("<%s>: has invalid charset", task->message_id); SET_PART_RAW (text_part); return part_content; } if (rspamd_regexp_match (utf_compatible_re, charset, strlen (charset), TRUE)) { if (g_utf8_validate (part_content->data, part_content->len, NULL)) { SET_PART_UTF (text_part); return part_content; } else { msg_info_task ("<%s>: contains invalid utf8 characters, assume it as raw", task->message_id); SET_PART_RAW (text_part); return part_content; } } else { res_str = rspamd_text_to_utf8 (task->task_pool, part_content->data, part_content->len, charset, &write_bytes, &err); if (res_str == NULL) { msg_warn_task ("<%s>: cannot convert from %s to utf8: %s", task->message_id, charset, err ? err->message : "unknown problem"); SET_PART_RAW (text_part); g_error_free (err); return part_content; } } result_array = rspamd_mempool_alloc (task->task_pool, sizeof (GByteArray)); result_array->data = res_str; result_array->len = write_bytes; SET_PART_UTF (text_part); return result_array; }