diff options
Diffstat (limited to 'src/libmime/mime_encoding.c')
-rw-r--r-- | src/libmime/mime_encoding.c | 533 |
1 files changed, 265 insertions, 268 deletions
diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c index a3467fc91..48a97a4af 100644 --- a/src/libmime/mime_encoding.c +++ b/src/libmime/mime_encoding.c @@ -54,23 +54,22 @@ struct rspamd_charset_substitution { static GHashTable *sub_hash = NULL; static const UChar iso_8859_16_map[] = { - 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, - 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F, - 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, - 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F, - 0x00A0, 0x0104, 0x0105, 0x0141, 0x20AC, 0x201E, 0x0160, 0x00A7, - 0x0161, 0x00A9, 0x0218, 0x00AB, 0x0179, 0x00AD, 0x017A, 0x017B, - 0x00B0, 0x00B1, 0x010C, 0x0142, 0x017D, 0x201D, 0x00B6, 0x00B7, - 0x017E, 0x010D, 0x0219, 0x00BB, 0x0152, 0x0153, 0x0178, 0x017C, - 0x00C0, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0106, 0x00C6, 0x00C7, - 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, - 0x0110, 0x0143, 0x00D2, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x015A, - 0x0170, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0118, 0x021A, 0x00DF, - 0x00E0, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x0107, 0x00E6, 0x00E7, - 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, - 0x0111, 0x0144, 0x00F2, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x015B, - 0x0171, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0119, 0x021B, 0x00FF -}; + 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F, + 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F, + 0x00A0, 0x0104, 0x0105, 0x0141, 0x20AC, 0x201E, 0x0160, 0x00A7, + 0x0161, 0x00A9, 0x0218, 0x00AB, 0x0179, 0x00AD, 0x017A, 0x017B, + 0x00B0, 0x00B1, 0x010C, 0x0142, 0x017D, 0x201D, 0x00B6, 0x00B7, + 0x017E, 0x010D, 0x0219, 0x00BB, 0x0152, 0x0153, 0x0178, 0x017C, + 0x00C0, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0106, 0x00C6, 0x00C7, + 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, + 0x0110, 0x0143, 0x00D2, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x015A, + 0x0170, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0118, 0x021A, 0x00DF, + 0x00E0, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x0107, 0x00E6, 0x00E7, + 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, + 0x0111, 0x0144, 0x00F2, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x015B, + 0x0171, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0119, 0x021B, 0x00FF}; struct rspamd_charset_converter { gchar *canon_name; @@ -82,37 +81,37 @@ struct rspamd_charset_converter { }; static GQuark -rspamd_charset_conv_error_quark (void) +rspamd_charset_conv_error_quark(void) { - return g_quark_from_static_string ("charset conversion error"); + return g_quark_from_static_string("charset conversion error"); } static void -rspamd_converter_dtor (gpointer p) +rspamd_converter_dtor(gpointer p) { - struct rspamd_charset_converter *c = (struct rspamd_charset_converter *)p; + struct rspamd_charset_converter *c = (struct rspamd_charset_converter *) p; if (!c->is_internal) { - ucnv_close (c->d.conv); + ucnv_close(c->d.conv); } - g_free (c->canon_name); - g_free (c); + g_free(c->canon_name); + g_free(c); } int32_t -rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv, - UChar *dest, - int32_t destCapacity, - const char *src, - int32_t srcLength, - UErrorCode *pErrorCode) +rspamd_converter_to_uchars(struct rspamd_charset_converter *cnv, + UChar *dest, + int32_t destCapacity, + const char *src, + int32_t srcLength, + UErrorCode *pErrorCode) { if (!cnv->is_internal) { - return ucnv_toUChars (cnv->d.conv, - dest, destCapacity, - src, srcLength, - pErrorCode); + return ucnv_toUChars(cnv->d.conv, + dest, destCapacity, + src, srcLength, + pErrorCode); } else { UChar *d = dest, *dend = dest + destCapacity; @@ -120,13 +119,13 @@ rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv, while (p < end && d < dend) { if (*p <= 127) { - *d++ = (UChar)*p; + *d++ = (UChar) *p; } else { *d++ = cnv->d.cnv_table[*p - 128]; } - p ++; + p++; } return d - dest; @@ -135,19 +134,19 @@ rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv, struct rspamd_charset_converter * -rspamd_mime_get_converter_cached (const gchar *enc, - rspamd_mempool_t *pool, - gboolean is_canon, - UErrorCode *err) +rspamd_mime_get_converter_cached(const gchar *enc, + rspamd_mempool_t *pool, + gboolean is_canon, + UErrorCode *err) { const gchar *canon_name; static rspamd_lru_hash_t *cache; struct rspamd_charset_converter *conv; if (cache == NULL) { - cache = rspamd_lru_hash_new_full (RSPAMD_CHARSET_CACHE_SIZE, NULL, - rspamd_converter_dtor, rspamd_str_hash, - rspamd_str_equal); + cache = rspamd_lru_hash_new_full(RSPAMD_CHARSET_CACHE_SIZE, NULL, + rspamd_converter_dtor, rspamd_str_hash, + rspamd_str_equal); } if (enc == NULL) { @@ -157,8 +156,8 @@ rspamd_mime_get_converter_cached (const gchar *enc, if (!is_canon) { rspamd_ftok_t cset_tok; - RSPAMD_FTOK_FROM_STR (&cset_tok, enc); - canon_name = rspamd_mime_detect_charset (&cset_tok, pool); + RSPAMD_FTOK_FROM_STR(&cset_tok, enc); + canon_name = rspamd_mime_detect_charset(&cset_tok, pool); } else { canon_name = enc; @@ -168,38 +167,38 @@ rspamd_mime_get_converter_cached (const gchar *enc, return NULL; } - conv = rspamd_lru_hash_lookup (cache, (gpointer)canon_name, 0); + conv = rspamd_lru_hash_lookup(cache, (gpointer) canon_name, 0); if (conv == NULL) { - if (!(strcmp (canon_name, "ISO-8859-16") == 0 || - strcmp (canon_name, "latin10") == 0 || - strcmp (canon_name, "iso-ir-226") == 0)) { - conv = g_malloc0 (sizeof (*conv)); - conv->d.conv = ucnv_open (canon_name, err); - conv->canon_name = g_strdup (canon_name); + if (!(strcmp(canon_name, "ISO-8859-16") == 0 || + strcmp(canon_name, "latin10") == 0 || + strcmp(canon_name, "iso-ir-226") == 0)) { + conv = g_malloc0(sizeof(*conv)); + conv->d.conv = ucnv_open(canon_name, err); + conv->canon_name = g_strdup(canon_name); if (conv->d.conv != NULL) { - ucnv_setToUCallBack (conv->d.conv, - UCNV_TO_U_CALLBACK_SUBSTITUTE, - NULL, - NULL, - NULL, - err); - rspamd_lru_hash_insert (cache, conv->canon_name, conv, 0, 0); + ucnv_setToUCallBack(conv->d.conv, + UCNV_TO_U_CALLBACK_SUBSTITUTE, + NULL, + NULL, + NULL, + err); + rspamd_lru_hash_insert(cache, conv->canon_name, conv, 0, 0); } else { - g_free (conv); + g_free(conv); conv = NULL; } } else { /* ISO-8859-16 */ - conv = g_malloc0 (sizeof (*conv)); + conv = g_malloc0(sizeof(*conv)); conv->is_internal = TRUE; conv->d.cnv_table = iso_8859_16_map; - conv->canon_name = g_strdup (canon_name); + conv->canon_name = g_strdup(canon_name); - rspamd_lru_hash_insert (cache, conv->canon_name, conv, 0, 0); + rspamd_lru_hash_insert(cache, conv->canon_name, conv, 0, 0); } } @@ -207,19 +206,19 @@ rspamd_mime_get_converter_cached (const gchar *enc, } static void -rspamd_mime_encoding_substitute_init (void) +rspamd_mime_encoding_substitute_init(void) { guint i; - sub_hash = g_hash_table_new (rspamd_strcase_hash, rspamd_strcase_equal); + sub_hash = g_hash_table_new(rspamd_strcase_hash, rspamd_strcase_equal); - for (i = 0; i < G_N_ELEMENTS (sub); i ++) { - g_hash_table_insert (sub_hash, (void *)sub[i].input, (void *)&sub[i]); + for (i = 0; i < G_N_ELEMENTS(sub); i++) { + g_hash_table_insert(sub_hash, (void *) sub[i].input, (void *) &sub[i]); } } static void -rspamd_charset_normalize (gchar *in) +rspamd_charset_normalize(gchar *in) { /* * This is a simple routine to validate input charset @@ -231,26 +230,26 @@ rspamd_charset_normalize (gchar *in) begin = in; - while (*begin && !g_ascii_isalnum (*begin)) { - begin ++; + while (*begin && !g_ascii_isalnum(*begin)) { + begin++; changed = TRUE; } - end = begin + strlen (begin) - 1; + end = begin + strlen(begin) - 1; - while (end > begin && !g_ascii_isalnum (*end)) { - end --; + while (end > begin && !g_ascii_isalnum(*end)) { + end--; changed = TRUE; } if (changed) { - memmove (in, begin, end - begin + 2); + memmove(in, begin, end - begin + 2); *(end + 1) = '\0'; } } const gchar * -rspamd_mime_detect_charset (const rspamd_ftok_t *in, rspamd_mempool_t *pool) +rspamd_mime_detect_charset(const rspamd_ftok_t *in, rspamd_mempool_t *pool) { gchar *ret = NULL, *h, *t; struct rspamd_charset_substitution *s; @@ -259,27 +258,27 @@ rspamd_mime_detect_charset (const rspamd_ftok_t *in, rspamd_mempool_t *pool) UErrorCode uc_err = U_ZERO_ERROR; if (sub_hash == NULL) { - rspamd_mime_encoding_substitute_init (); + rspamd_mime_encoding_substitute_init(); } /* Fast path */ - RSPAMD_FTOK_ASSIGN (&utf8_tok, "utf-8"); + RSPAMD_FTOK_ASSIGN(&utf8_tok, "utf-8"); - if (rspamd_ftok_casecmp (in, &utf8_tok) == 0) { + if (rspamd_ftok_casecmp(in, &utf8_tok) == 0) { return UTF8_CHARSET; } - RSPAMD_FTOK_ASSIGN (&utf8_tok, "utf8"); + RSPAMD_FTOK_ASSIGN(&utf8_tok, "utf8"); - if (rspamd_ftok_casecmp (in, &utf8_tok) == 0) { + if (rspamd_ftok_casecmp(in, &utf8_tok) == 0) { return UTF8_CHARSET; } - ret = rspamd_mempool_ftokdup (pool, in); - rspamd_charset_normalize (ret); + ret = rspamd_mempool_ftokdup(pool, in); + rspamd_charset_normalize(ret); - if ((in->len > 3 && rspamd_lc_cmp (in->begin, "cp-", 3) == 0) || - (in->len > 4 && (rspamd_lc_cmp (in->begin, "ibm-", 4) == 0))) { + if ((in->len > 3 && rspamd_lc_cmp(in->begin, "cp-", 3) == 0) || + (in->len > 4 && (rspamd_lc_cmp(in->begin, "ibm-", 4) == 0))) { /* Try to remove '-' chars from encoding: e.g. CP-100 to CP100 */ h = ret; t = ret; @@ -289,43 +288,43 @@ rspamd_mime_detect_charset (const rspamd_ftok_t *in, rspamd_mempool_t *pool) *t++ = *h; } - h ++; + h++; } *t = '\0'; } - s = g_hash_table_lookup (sub_hash, ret); + s = g_hash_table_lookup(sub_hash, ret); if (s) { - ret = (char *)s->canon; + ret = (char *) s->canon; } /* Try different aliases */ - cset = ucnv_getCanonicalName (ret, "MIME", &uc_err); + cset = ucnv_getCanonicalName(ret, "MIME", &uc_err); if (cset == NULL) { uc_err = U_ZERO_ERROR; - cset = ucnv_getCanonicalName (ret, "IANA", &uc_err); + cset = ucnv_getCanonicalName(ret, "IANA", &uc_err); } if (cset == NULL) { uc_err = U_ZERO_ERROR; - cset = ucnv_getCanonicalName (ret, "", &uc_err); + cset = ucnv_getCanonicalName(ret, "", &uc_err); } if (cset == NULL) { uc_err = U_ZERO_ERROR; - cset = ucnv_getAlias (ret, 0, &uc_err); + cset = ucnv_getAlias(ret, 0, &uc_err); } return cset; } gchar * -rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool, - gchar *input, gsize len, const gchar *in_enc, - gsize *olen, GError **err) +rspamd_mime_text_to_utf8(rspamd_mempool_t *pool, + gchar *input, gsize len, const gchar *in_enc, + gsize *olen, GError **err) { gchar *d; gint32 r, clen, dlen; @@ -337,12 +336,12 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool, rspamd_ftok_t cset_tok; /* Check if already utf8 */ - RSPAMD_FTOK_FROM_STR (&cset_tok, in_enc); + RSPAMD_FTOK_FROM_STR(&cset_tok, in_enc); - if (rspamd_mime_charset_utf_check (&cset_tok, input, len, - FALSE)) { - d = rspamd_mempool_alloc (pool, len); - memcpy (d, input, len); + if (rspamd_mime_charset_utf_check(&cset_tok, input, len, + FALSE)) { + d = rspamd_mempool_alloc(pool, len); + memcpy(d, input, len); if (olen) { *olen = len; } @@ -350,48 +349,48 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool, return d; } - conv = rspamd_mime_get_converter_cached (in_enc, pool, TRUE, &uc_err); - utf8_converter = rspamd_get_utf8_converter (); + conv = rspamd_mime_get_converter_cached(in_enc, pool, TRUE, &uc_err); + utf8_converter = rspamd_get_utf8_converter(); if (conv == NULL) { - g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL, - "cannot open converter for %s: %s", - in_enc, u_errorName (uc_err)); + g_set_error(err, rspamd_charset_conv_error_quark(), EINVAL, + "cannot open converter for %s: %s", + in_enc, u_errorName(uc_err)); return NULL; } - tmp_buf = g_new (UChar, len + 1); + tmp_buf = g_new(UChar, len + 1); uc_err = U_ZERO_ERROR; - r = rspamd_converter_to_uchars (conv, tmp_buf, len + 1, input, len, &uc_err); + r = rspamd_converter_to_uchars(conv, tmp_buf, len + 1, input, len, &uc_err); - if (!U_SUCCESS (uc_err)) { - g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL, + if (!U_SUCCESS(uc_err)) { + g_set_error(err, rspamd_charset_conv_error_quark(), EINVAL, "cannot convert data to unicode from %s: %s", - in_enc, u_errorName (uc_err)); - g_free (tmp_buf); + in_enc, u_errorName(uc_err)); + g_free(tmp_buf); return NULL; } /* Now, convert to utf8 */ - clen = ucnv_getMaxCharSize (utf8_converter); - dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen); - d = rspamd_mempool_alloc (pool, dlen); - r = ucnv_fromUChars (utf8_converter, d, dlen, tmp_buf, r, &uc_err); + clen = ucnv_getMaxCharSize(utf8_converter); + dlen = UCNV_GET_MAX_BYTES_FOR_STRING(r, clen); + d = rspamd_mempool_alloc(pool, dlen); + r = ucnv_fromUChars(utf8_converter, d, dlen, tmp_buf, r, &uc_err); - if (!U_SUCCESS (uc_err)) { - g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL, - "cannot convert data from unicode from %s: %s", - in_enc, u_errorName (uc_err)); - g_free (tmp_buf); + if (!U_SUCCESS(uc_err)) { + g_set_error(err, rspamd_charset_conv_error_quark(), EINVAL, + "cannot convert data from unicode from %s: %s", + in_enc, u_errorName(uc_err)); + g_free(tmp_buf); return NULL; } - msg_debug_pool ("converted from %s to UTF-8 inlen: %z, outlen: %d", - in_enc, len, r); - g_free (tmp_buf); + msg_debug_pool("converted from %s to UTF-8 inlen: %z, outlen: %d", + in_enc, len, r); + g_free(tmp_buf); if (olen) { *olen = r; @@ -401,11 +400,11 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool, } static gboolean -rspamd_mime_text_part_utf8_convert (struct rspamd_task *task, - struct rspamd_mime_text_part *text_part, - GByteArray *input, - const gchar *charset, - GError **err) +rspamd_mime_text_part_utf8_convert(struct rspamd_task *task, + struct rspamd_mime_text_part *text_part, + GByteArray *input, + const gchar *charset, + GError **err) { gchar *d; gint32 r, clen, dlen, uc_len; @@ -414,76 +413,76 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task, UConverter *utf8_converter; struct rspamd_charset_converter *conv; - conv = rspamd_mime_get_converter_cached (charset, task->task_pool, - TRUE, &uc_err); - utf8_converter = rspamd_get_utf8_converter (); + conv = rspamd_mime_get_converter_cached(charset, task->task_pool, + TRUE, &uc_err); + utf8_converter = rspamd_get_utf8_converter(); if (conv == NULL) { - g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL, - "cannot open converter for %s: %s", - charset, u_errorName (uc_err)); + g_set_error(err, rspamd_charset_conv_error_quark(), EINVAL, + "cannot open converter for %s: %s", + charset, u_errorName(uc_err)); return FALSE; } - tmp_buf = g_new (UChar, input->len + 1); + tmp_buf = g_new(UChar, input->len + 1); uc_err = U_ZERO_ERROR; - uc_len = rspamd_converter_to_uchars (conv, - tmp_buf, - input->len + 1, - input->data, - input->len, - &uc_err); - - if (!U_SUCCESS (uc_err)) { - g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL, - "cannot convert data to unicode from %s: %s", - charset, u_errorName (uc_err)); - g_free (tmp_buf); + uc_len = rspamd_converter_to_uchars(conv, + tmp_buf, + input->len + 1, + input->data, + input->len, + &uc_err); + + if (!U_SUCCESS(uc_err)) { + g_set_error(err, rspamd_charset_conv_error_quark(), EINVAL, + "cannot convert data to unicode from %s: %s", + charset, u_errorName(uc_err)); + g_free(tmp_buf); return FALSE; } /* Now, convert to utf8 */ - clen = ucnv_getMaxCharSize (utf8_converter); - dlen = UCNV_GET_MAX_BYTES_FOR_STRING (uc_len, clen); - d = rspamd_mempool_alloc (task->task_pool, dlen); - r = ucnv_fromUChars (utf8_converter, d, dlen, - tmp_buf, uc_len, &uc_err); - - if (!U_SUCCESS (uc_err)) { - g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL, - "cannot convert data from unicode from %s: %s", - charset, u_errorName (uc_err)); - g_free (tmp_buf); + clen = ucnv_getMaxCharSize(utf8_converter); + dlen = UCNV_GET_MAX_BYTES_FOR_STRING(uc_len, clen); + d = rspamd_mempool_alloc(task->task_pool, dlen); + r = ucnv_fromUChars(utf8_converter, d, dlen, + tmp_buf, uc_len, &uc_err); + + if (!U_SUCCESS(uc_err)) { + g_set_error(err, rspamd_charset_conv_error_quark(), EINVAL, + "cannot convert data from unicode from %s: %s", + charset, u_errorName(uc_err)); + g_free(tmp_buf); return FALSE; } if (text_part->mime_part && text_part->mime_part->ct) { - msg_info_task ("converted text part from %s ('%T' announced) to UTF-8 inlen: %d, outlen: %d (%d UTF16 chars)", - charset, &text_part->mime_part->ct->charset, input->len, r, uc_len); + msg_info_task("converted text part from %s ('%T' announced) to UTF-8 inlen: %d, outlen: %d (%d UTF16 chars)", + charset, &text_part->mime_part->ct->charset, input->len, r, uc_len); } else { - msg_info_task ("converted text part from %s (no charset announced) to UTF-8 inlen: %d, " - "outlen: %d (%d UTF16 chars)", - charset, input->len, r, uc_len); + msg_info_task("converted text part from %s (no charset announced) to UTF-8 inlen: %d, " + "outlen: %d (%d UTF16 chars)", + charset, input->len, r, uc_len); } - text_part->utf_raw_content = rspamd_mempool_alloc (task->task_pool, - sizeof (*text_part->utf_raw_content) + sizeof (gpointer) * 4); + text_part->utf_raw_content = rspamd_mempool_alloc(task->task_pool, + sizeof(*text_part->utf_raw_content) + sizeof(gpointer) * 4); text_part->utf_raw_content->data = d; text_part->utf_raw_content->len = r; - g_free (tmp_buf); + g_free(tmp_buf); return TRUE; } gboolean -rspamd_mime_to_utf8_byte_array (GByteArray *in, - GByteArray *out, - rspamd_mempool_t *pool, - const gchar *enc) +rspamd_mime_to_utf8_byte_array(GByteArray *in, + GByteArray *out, + rspamd_mempool_t *pool, + const gchar *enc) { gint32 r, clen, dlen; UChar *tmp_buf; @@ -498,9 +497,9 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in, if (enc == NULL) { /* Assume utf ? */ - if (rspamd_fast_utf8_validate (in->data, in->len) == 0) { - g_byte_array_set_size (out, in->len); - memcpy (out->data, in->data, out->len); + if (rspamd_fast_utf8_validate(in->data, in->len) == 0) { + g_byte_array_set_size(out, in->len); + memcpy(out->data, in->data, out->len); return TRUE; } @@ -510,55 +509,54 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in, } } - RSPAMD_FTOK_FROM_STR (&charset_tok, enc); + RSPAMD_FTOK_FROM_STR(&charset_tok, enc); - if (rspamd_mime_charset_utf_check (&charset_tok, (gchar *)in->data, in->len, - FALSE)) { - g_byte_array_set_size (out, in->len); - memcpy (out->data, in->data, out->len); + if (rspamd_mime_charset_utf_check(&charset_tok, (gchar *) in->data, in->len, + FALSE)) { + g_byte_array_set_size(out, in->len); + memcpy(out->data, in->data, out->len); return TRUE; } - utf8_converter = rspamd_get_utf8_converter (); - conv = rspamd_mime_get_converter_cached (enc, pool, TRUE, &uc_err); + utf8_converter = rspamd_get_utf8_converter(); + conv = rspamd_mime_get_converter_cached(enc, pool, TRUE, &uc_err); if (conv == NULL) { return FALSE; } - tmp_buf = g_new (UChar, in->len + 1); + tmp_buf = g_new(UChar, in->len + 1); uc_err = U_ZERO_ERROR; - r = rspamd_converter_to_uchars (conv, - tmp_buf, in->len + 1, - in->data, in->len, &uc_err); + r = rspamd_converter_to_uchars(conv, + tmp_buf, in->len + 1, + in->data, in->len, &uc_err); - if (!U_SUCCESS (uc_err)) { - g_free (tmp_buf); + if (!U_SUCCESS(uc_err)) { + g_free(tmp_buf); return FALSE; } /* Now, convert to utf8 */ - clen = ucnv_getMaxCharSize (utf8_converter); - dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen); - g_byte_array_set_size (out, dlen); - r = ucnv_fromUChars (utf8_converter, out->data, dlen, tmp_buf, r, &uc_err); + clen = ucnv_getMaxCharSize(utf8_converter); + dlen = UCNV_GET_MAX_BYTES_FOR_STRING(r, clen); + g_byte_array_set_size(out, dlen); + r = ucnv_fromUChars(utf8_converter, out->data, dlen, tmp_buf, r, &uc_err); - if (!U_SUCCESS (uc_err)) { - g_free (tmp_buf); + if (!U_SUCCESS(uc_err)) { + g_free(tmp_buf); return FALSE; } - g_free (tmp_buf); + g_free(tmp_buf); out->len = r; return TRUE; } -void -rspamd_mime_charset_utf_enforce (gchar *in, gsize len) +void rspamd_mime_charset_utf_enforce(gchar *in, gsize len) { gchar *p, *end; goffset err_offset; @@ -568,25 +566,25 @@ rspamd_mime_charset_utf_enforce (gchar *in, gsize len) p = in; end = in + len; - while (p < end && len > 0 && (err_offset = rspamd_fast_utf8_validate (p, len)) > 0) { - err_offset --; /* As it returns it 1 indexed */ + while (p < end && len > 0 && (err_offset = rspamd_fast_utf8_validate(p, len)) > 0) { + err_offset--; /* As it returns it 1 indexed */ gint32 cur_offset = err_offset; while (cur_offset < len) { gint32 tmp = cur_offset; - U8_NEXT (p, cur_offset, len, uc); + U8_NEXT(p, cur_offset, len, uc); if (uc > 0) { /* Fill string between err_offset and tmp with `?` character */ - memset (p + err_offset, '?', tmp - err_offset); + memset(p + err_offset, '?', tmp - err_offset); break; } } if (uc < 0) { /* Fill till the end */ - memset (p + err_offset, '?', len - err_offset); + memset(p + err_offset, '?', len - err_offset); break; } @@ -596,23 +594,23 @@ rspamd_mime_charset_utf_enforce (gchar *in, gsize len) } const char * -rspamd_mime_charset_find_by_content (const gchar *in, gsize inlen, - bool check_utf8) +rspamd_mime_charset_find_by_content(const gchar *in, gsize inlen, + bool check_utf8) { int nconsumed; bool is_reliable; const gchar *ced_name; if (check_utf8) { - if (rspamd_fast_utf8_validate (in, inlen) == 0) { + if (rspamd_fast_utf8_validate(in, inlen) == 0) { return UTF8_CHARSET; } } - ced_name = ced_encoding_detect (in, inlen, NULL, NULL, - NULL, 0, CED_EMAIL_CORPUS, - false, &nconsumed, &is_reliable); + ced_name = ced_encoding_detect(in, inlen, NULL, NULL, + NULL, 0, CED_EMAIL_CORPUS, + false, &nconsumed, &is_reliable); if (ced_name) { @@ -623,28 +621,28 @@ rspamd_mime_charset_find_by_content (const gchar *in, gsize inlen, } static const char * -rspamd_mime_charset_find_by_content_maybe_split (const gchar *in, gsize inlen) +rspamd_mime_charset_find_by_content_maybe_split(const gchar *in, gsize inlen) { if (inlen < RSPAMD_CHARSET_MAX_CONTENT * 3) { - return rspamd_mime_charset_find_by_content (in, inlen, false); + return rspamd_mime_charset_find_by_content(in, inlen, false); } else { const gchar *c1, *c2, *c3; - c1 = rspamd_mime_charset_find_by_content (in, RSPAMD_CHARSET_MAX_CONTENT, false); - c2 = rspamd_mime_charset_find_by_content (in + inlen / 2, - RSPAMD_CHARSET_MAX_CONTENT, false); - c3 = rspamd_mime_charset_find_by_content (in + inlen - RSPAMD_CHARSET_MAX_CONTENT, - RSPAMD_CHARSET_MAX_CONTENT, false); + c1 = rspamd_mime_charset_find_by_content(in, RSPAMD_CHARSET_MAX_CONTENT, false); + c2 = rspamd_mime_charset_find_by_content(in + inlen / 2, + RSPAMD_CHARSET_MAX_CONTENT, false); + c3 = rspamd_mime_charset_find_by_content(in + inlen - RSPAMD_CHARSET_MAX_CONTENT, + RSPAMD_CHARSET_MAX_CONTENT, false); /* 7bit stuff */ - if (c1 && strcmp (c1, "US-ASCII") == 0) { + if (c1 && strcmp(c1, "US-ASCII") == 0) { c1 = NULL; /* Invalid - we have 8 bit there */ } - if (c2 && strcmp (c2, "US-ASCII") == 0) { + if (c2 && strcmp(c2, "US-ASCII") == 0) { c2 = NULL; /* Invalid - we have 8 bit there */ } - if (c3 && strcmp (c3, "US-ASCII") == 0) { + if (c3 && strcmp(c3, "US-ASCII") == 0) { c3 = NULL; /* Invalid - we have 8 bit there */ } @@ -679,45 +677,45 @@ rspamd_mime_charset_find_by_content_maybe_split (const gchar *in, gsize inlen) } gboolean -rspamd_mime_charset_utf_check (rspamd_ftok_t *charset, - gchar *in, gsize len, gboolean content_check) +rspamd_mime_charset_utf_check(rspamd_ftok_t *charset, + gchar *in, gsize len, gboolean content_check) { const gchar *real_charset; if (utf_compatible_re == NULL) { - utf_compatible_re = rspamd_regexp_new ( - "^(?:utf-?8.*)|(?:us-ascii)|(?:ascii)|(?:ansi.*)|(?:CSASCII)$", - "i", NULL); + utf_compatible_re = rspamd_regexp_new( + "^(?:utf-?8.*)|(?:us-ascii)|(?:ascii)|(?:ansi.*)|(?:CSASCII)$", + "i", NULL); } if (charset->len == 0 || - rspamd_regexp_match (utf_compatible_re, - charset->begin, charset->len, TRUE)) { + rspamd_regexp_match(utf_compatible_re, + charset->begin, charset->len, TRUE)) { /* * In case of UTF8 charset we still can check the content to find * corner cases */ if (content_check) { - if (rspamd_fast_utf8_validate (in, len) != 0) { + if (rspamd_fast_utf8_validate(in, len) != 0) { real_charset = rspamd_mime_charset_find_by_content_maybe_split(in, len); if (real_charset) { - if (rspamd_regexp_match (utf_compatible_re, - real_charset, strlen (real_charset), TRUE)) { - RSPAMD_FTOK_ASSIGN (charset, UTF8_CHARSET); + if (rspamd_regexp_match(utf_compatible_re, + real_charset, strlen(real_charset), TRUE)) { + RSPAMD_FTOK_ASSIGN(charset, UTF8_CHARSET); return TRUE; } else { charset->begin = real_charset; - charset->len = strlen (real_charset); + charset->len = strlen(real_charset); return FALSE; } } - rspamd_mime_charset_utf_enforce (in, len); + rspamd_mime_charset_utf_enforce(in, len); } } @@ -727,9 +725,8 @@ rspamd_mime_charset_utf_check (rspamd_ftok_t *charset, return FALSE; } -void -rspamd_mime_text_part_maybe_convert (struct rspamd_task *task, - struct rspamd_mime_text_part *text_part) +void rspamd_mime_text_part_maybe_convert(struct rspamd_task *task, + struct rspamd_mime_text_part *text_part) { GError *err = NULL; const gchar *charset = NULL; @@ -738,21 +735,21 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task, rspamd_ftok_t charset_tok; struct rspamd_mime_part *part = text_part->mime_part; - if (rspamd_str_has_8bit (text_part->raw.begin, text_part->raw.len)) { + if (rspamd_str_has_8bit(text_part->raw.begin, text_part->raw.len)) { text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT_RAW; } /* Allocate copy storage */ - part_content = g_byte_array_sized_new (text_part->parsed.len); - memcpy (part_content->data, text_part->parsed.begin, text_part->parsed.len); + part_content = g_byte_array_sized_new(text_part->parsed.len); + memcpy(part_content->data, text_part->parsed.begin, text_part->parsed.len); part_content->len = text_part->parsed.len; - rspamd_mempool_notify_alloc (task->task_pool, - part_content->len); - rspamd_mempool_add_destructor (task->task_pool, - (rspamd_mempool_destruct_t)g_byte_array_unref, part_content); + rspamd_mempool_notify_alloc(task->task_pool, + part_content->len); + rspamd_mempool_add_destructor(task->task_pool, + (rspamd_mempool_destruct_t) g_byte_array_unref, part_content); - if (rspamd_str_has_8bit (text_part->parsed.begin, text_part->parsed.len)) { - if (rspamd_fast_utf8_validate (text_part->parsed.begin, text_part->parsed.len) == 0) { + if (rspamd_str_has_8bit(text_part->parsed.begin, text_part->parsed.len)) { + if (rspamd_fast_utf8_validate(text_part->parsed.begin, text_part->parsed.len) == 0) { /* Valid UTF, likely all good */ need_charset_heuristic = FALSE; valid_utf8 = TRUE; @@ -770,18 +767,18 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task, if (part->ct->charset.len == 0) { if (need_charset_heuristic) { - charset = rspamd_mime_charset_find_by_content_maybe_split (text_part->parsed.begin, - text_part->parsed.len); + charset = rspamd_mime_charset_find_by_content_maybe_split(text_part->parsed.begin, + text_part->parsed.len); if (charset != NULL) { - msg_info_task ("detected charset %s", charset); + msg_info_task("detected charset %s", charset); } checked = TRUE; text_part->real_charset = charset; } else if (valid_utf8) { - SET_PART_UTF (text_part); + SET_PART_UTF(text_part); text_part->utf_raw_content = part_content; text_part->real_charset = UTF8_CHARSET; @@ -789,15 +786,15 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task, } } else { - charset = rspamd_mime_detect_charset (&part->ct->charset, - task->task_pool); + charset = rspamd_mime_detect_charset(&part->ct->charset, + task->task_pool); if (charset == NULL) { /* We don't know the real charset but can try heuristic */ if (need_charset_heuristic) { - charset = rspamd_mime_charset_find_by_content_maybe_split (part_content->data, - part_content->len); - msg_info_task ("detected charset: %s", charset); + charset = rspamd_mime_charset_find_by_content_maybe_split(part_content->data, + part_content->len); + msg_info_task("detected charset: %s", charset); checked = TRUE; text_part->real_charset = charset; } @@ -809,7 +806,7 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task, else { text_part->real_charset = charset; - if (strcmp (charset, UTF8_CHARSET) != 0) { + if (strcmp(charset, UTF8_CHARSET) != 0) { /* * We have detected some charset, but we don't know which one, * so we need to reset valid utf8 flag and enforce it later @@ -820,21 +817,21 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task, } if (text_part->real_charset == NULL) { - msg_info_task ("<%s>: has invalid charset; original charset: %T; Content-Type: \"%s\"", - MESSAGE_FIELD_CHECK (task, message_id), &part->ct->charset, - part->ct->cpy); - SET_PART_RAW (text_part); + msg_info_task("<%s>: has invalid charset; original charset: %T; Content-Type: \"%s\"", + MESSAGE_FIELD_CHECK(task, message_id), &part->ct->charset, + part->ct->cpy); + SET_PART_RAW(text_part); text_part->utf_raw_content = part_content; return; } - RSPAMD_FTOK_FROM_STR (&charset_tok, charset); + RSPAMD_FTOK_FROM_STR(&charset_tok, charset); if (!valid_utf8) { - if (rspamd_mime_charset_utf_check (&charset_tok, part_content->data, - part_content->len, !checked)) { - SET_PART_UTF (text_part); + if (rspamd_mime_charset_utf_check(&charset_tok, part_content->data, + part_content->len, !checked)) { + SET_PART_UTF(text_part); text_part->utf_raw_content = part_content; text_part->real_charset = UTF8_CHARSET; @@ -843,25 +840,25 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task, else { charset = charset_tok.begin; - if (!rspamd_mime_text_part_utf8_convert (task, text_part, - part_content, charset, &err)) { - msg_warn_task ("<%s>: cannot convert from %s to utf8: %s", - MESSAGE_FIELD (task, message_id), - charset, - err ? err->message : "unknown problem"); - SET_PART_RAW (text_part); - g_error_free (err); + if (!rspamd_mime_text_part_utf8_convert(task, text_part, + part_content, charset, &err)) { + msg_warn_task("<%s>: cannot convert from %s to utf8: %s", + MESSAGE_FIELD(task, message_id), + charset, + err ? err->message : "unknown problem"); + SET_PART_RAW(text_part); + g_error_free(err); text_part->utf_raw_content = part_content; return; } - SET_PART_UTF (text_part); + SET_PART_UTF(text_part); text_part->real_charset = charset; } } else { - SET_PART_UTF (text_part); + SET_PART_UTF(text_part); text_part->utf_raw_content = part_content; } } |