@@ -67,7 +67,8 @@ rspamd_archive_file_try_utf (struct rspamd_task *task, | |||
struct rspamd_charset_converter *conv; | |||
UConverter *utf8_converter; | |||
conv = rspamd_mime_get_converter_cached (charset, &uc_err); | |||
conv = rspamd_mime_get_converter_cached (charset, task->task_pool, | |||
FALSE, &uc_err); | |||
utf8_converter = rspamd_get_utf8_converter (); | |||
if (conv == NULL) { |
@@ -135,7 +135,10 @@ rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv, | |||
struct rspamd_charset_converter * | |||
rspamd_mime_get_converter_cached (const gchar *enc, UErrorCode *err) | |||
rspamd_mime_get_converter_cached (const gchar *enc, | |||
rspamd_mempool_t *pool, | |||
gboolean is_canon, | |||
UErrorCode *err) | |||
{ | |||
const gchar *canon_name; | |||
static rspamd_lru_hash_t *cache; | |||
@@ -147,7 +150,19 @@ rspamd_mime_get_converter_cached (const gchar *enc, UErrorCode *err) | |||
rspamd_str_equal); | |||
} | |||
canon_name = ucnv_getStandardName (enc, "IANA", err); | |||
if (enc == NULL) { | |||
return NULL; | |||
} | |||
if (!is_canon) { | |||
rspamd_ftok_t cset_tok; | |||
RSPAMD_FTOK_FROM_STR (&cset_tok, enc); | |||
canon_name = rspamd_mime_detect_charset (&cset_tok, pool); | |||
} | |||
else { | |||
canon_name = enc; | |||
} | |||
if (canon_name == NULL) { | |||
return NULL; | |||
@@ -306,7 +321,7 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool, | |||
UConverter *utf8_converter; | |||
struct rspamd_charset_converter *conv; | |||
conv = rspamd_mime_get_converter_cached (in_enc, &uc_err); | |||
conv = rspamd_mime_get_converter_cached (in_enc, pool, TRUE, &uc_err); | |||
utf8_converter = rspamd_get_utf8_converter (); | |||
if (conv == NULL) { | |||
@@ -370,7 +385,8 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task, | |||
UConverter *utf8_converter; | |||
struct rspamd_charset_converter *conv; | |||
conv = rspamd_mime_get_converter_cached (charset, &uc_err); | |||
conv = rspamd_mime_get_converter_cached (charset, task->task_pool, | |||
TRUE, &uc_err); | |||
utf8_converter = rspamd_get_utf8_converter (); | |||
if (conv == NULL) { | |||
@@ -429,6 +445,7 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task, | |||
gboolean | |||
rspamd_mime_to_utf8_byte_array (GByteArray *in, | |||
GByteArray *out, | |||
rspamd_mempool_t *pool, | |||
const gchar *enc) | |||
{ | |||
gint32 r, clen, dlen; | |||
@@ -438,6 +455,24 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in, | |||
struct rspamd_charset_converter *conv; | |||
rspamd_ftok_t charset_tok; | |||
if (in == NULL || in->len == 0) { | |||
return FALSE; | |||
} | |||
if (enc == NULL) { | |||
/* Assume utf ? */ | |||
if (rspamd_fast_utf8_validate (in->data, in->len) == 0) { | |||
g_byte_array_set_size (out, in->len); | |||
memcpy (out->data, in->data, out->len); | |||
return TRUE; | |||
} | |||
else { | |||
/* Bad stuff, keep out */ | |||
return FALSE; | |||
} | |||
} | |||
RSPAMD_FTOK_FROM_STR (&charset_tok, enc); | |||
if (rspamd_mime_charset_utf_check (&charset_tok, (gchar *)in->data, in->len, | |||
@@ -449,7 +484,7 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in, | |||
} | |||
utf8_converter = rspamd_get_utf8_converter (); | |||
conv = rspamd_mime_get_converter_cached (enc, &uc_err); | |||
conv = rspamd_mime_get_converter_cached (enc, pool, TRUE, &uc_err); | |||
if (conv == NULL) { | |||
return FALSE; |
@@ -47,7 +47,7 @@ const gchar *rspamd_mime_detect_charset (const rspamd_ftok_t *in, | |||
* @param pool | |||
* @param input | |||
* @param len | |||
* @param in_enc | |||
* @param in_enc canon charset | |||
* @param olen | |||
* @param err | |||
* @return | |||
@@ -57,14 +57,20 @@ gchar *rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool, | |||
gsize *olen, GError **err); | |||
/** | |||
* Converts data from `in` to `out`, returns `FALSE` if `enc` is not a valid iconv charset | |||
* Converts data from `in` to `out`, | |||
* returns `FALSE` if `enc` is not a valid charset | |||
* | |||
* This function, in fact, copies `in` to `out`, replacing the content of | |||
* `out` entirely. | |||
* @param in | |||
* @param out | |||
* @param enc | |||
* @param enc validated canonical charset name. If NULL, then utf8 check is done only | |||
* @return | |||
*/ | |||
gboolean rspamd_mime_to_utf8_byte_array (GByteArray *in, | |||
GByteArray *out, const gchar *enc); | |||
GByteArray *out, | |||
rspamd_mempool_t *pool, | |||
const gchar *enc); | |||
/** | |||
* Maybe convert part to utf-8 | |||
@@ -83,7 +89,8 @@ void rspamd_mime_text_part_maybe_convert (struct rspamd_task *task, | |||
* @return | |||
*/ | |||
gboolean rspamd_mime_charset_utf_check (rspamd_ftok_t *charset, | |||
gchar *in, gsize len, gboolean content_check); | |||
gchar *in, gsize len, | |||
gboolean content_check); | |||
/** | |||
* Ensure that all characters in string are valid utf8 chars or replace them | |||
@@ -93,14 +100,18 @@ gboolean rspamd_mime_charset_utf_check (rspamd_ftok_t *charset, | |||
*/ | |||
void rspamd_mime_charset_utf_enforce (gchar *in, gsize len); | |||
/** | |||
* Gets cached converter | |||
* @param enc | |||
* @param err | |||
* @return | |||
*/ | |||
/** | |||
* Gets cached converter | |||
* @param enc input encoding | |||
* @param pool pool to use for temporary normalisation | |||
* @param is_canon TRUE if `enc` is already a canonical charset name; FALSE if it must be normalised first | |||
* @param err output error | |||
* @return converter | |||
*/ | |||
struct rspamd_charset_converter *rspamd_mime_get_converter_cached ( | |||
const gchar *enc, | |||
rspamd_mempool_t *pool, | |||
gboolean is_canon, | |||
UErrorCode *err); | |||
/** |
@@ -512,9 +512,12 @@ rspamd_mime_headers_process (struct rspamd_task *task, | |||
} | |||
static void | |||
rspamd_mime_header_maybe_save_token (rspamd_mempool_t *pool, GString *out, | |||
GByteArray *token, GByteArray *decoded_token, | |||
rspamd_ftok_t *old_charset, rspamd_ftok_t *new_charset) | |||
rspamd_mime_header_maybe_save_token (rspamd_mempool_t *pool, | |||
GString *out, | |||
GByteArray *token, | |||
GByteArray *decoded_token, | |||
rspamd_ftok_t *old_charset, | |||
rspamd_ftok_t *new_charset) | |||
{ | |||
if (new_charset->len == 0) { | |||
g_assert_not_reached (); | |||
@@ -538,14 +541,22 @@ rspamd_mime_header_maybe_save_token (rspamd_mempool_t *pool, GString *out, | |||
} | |||
/* We need to flush and decode old token to out string */ | |||
if (rspamd_mime_to_utf8_byte_array (token, decoded_token, | |||
if (rspamd_mime_to_utf8_byte_array (token, decoded_token, pool, | |||
rspamd_mime_detect_charset (new_charset, pool))) { | |||
g_string_append_len (out, decoded_token->data, decoded_token->len); | |||
} | |||
/* We also reset buffer */ | |||
g_byte_array_set_size (token, 0); | |||
/* Propagate charset */ | |||
/* | |||
* Propagate charset | |||
* | |||
* Here are dragons: we save the original charset to allow buffers concat | |||
* in the condition at the beginning of the function. | |||
* However, it will likely cause unnecessary calls for | |||
* `rspamd_mime_detect_charset` which could be relatively expensive. | |||
* But we ignore that for now... | |||
*/ | |||
memcpy (old_charset, new_charset, sizeof (*old_charset)); | |||
} | |||