diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-11-19 10:05:14 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-11-19 10:05:14 +0000 |
commit | 7428ea06d530334f715f7e33a3da6ed0183c4967 (patch) | |
tree | 22ac5dd34c1580a6fa3700616f61dbd29c3b1401 | |
parent | bd455692ef8f174210fc48f0048feed392008782 (diff) | |
download | rspamd-7428ea06d530334f715f7e33a3da6ed0183c4967.tar.gz rspamd-7428ea06d530334f715f7e33a3da6ed0183c4967.zip |
[Minor] Unify converters usage
-rw-r--r-- | src/libmime/archives.c | 3 | ||||
-rw-r--r-- | src/libmime/mime_encoding.c | 45 | ||||
-rw-r--r-- | src/libmime/mime_encoding.h | 33 | ||||
-rw-r--r-- | src/libmime/mime_headers.c | 21 |
4 files changed, 80 insertions, 22 deletions
diff --git a/src/libmime/archives.c b/src/libmime/archives.c index 8c7e4ea90..5701ce95c 100644 --- a/src/libmime/archives.c +++ b/src/libmime/archives.c @@ -67,7 +67,8 @@ rspamd_archive_file_try_utf (struct rspamd_task *task, struct rspamd_charset_converter *conv; UConverter *utf8_converter; - conv = rspamd_mime_get_converter_cached (charset, &uc_err); + conv = rspamd_mime_get_converter_cached (charset, task->task_pool, + FALSE, &uc_err); utf8_converter = rspamd_get_utf8_converter (); if (conv == NULL) { diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c index 7872fa330..7ef492f9d 100644 --- a/src/libmime/mime_encoding.c +++ b/src/libmime/mime_encoding.c @@ -135,7 +135,10 @@ rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv, struct rspamd_charset_converter * -rspamd_mime_get_converter_cached (const gchar *enc, UErrorCode *err) +rspamd_mime_get_converter_cached (const gchar *enc, + rspamd_mempool_t *pool, + gboolean is_canon, + UErrorCode *err) { const gchar *canon_name; static rspamd_lru_hash_t *cache; @@ -147,7 +150,19 @@ rspamd_mime_get_converter_cached (const gchar *enc, UErrorCode *err) rspamd_str_equal); } - canon_name = ucnv_getStandardName (enc, "IANA", err); + if (enc == NULL) { + return NULL; + } + + if (!is_canon) { + rspamd_ftok_t cset_tok; + + RSPAMD_FTOK_FROM_STR (&cset_tok, enc); + canon_name = rspamd_mime_detect_charset (&cset_tok, pool); + } + else { + canon_name = enc; + } if (canon_name == NULL) { return NULL; @@ -306,7 +321,7 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool, UConverter *utf8_converter; struct rspamd_charset_converter *conv; - conv = rspamd_mime_get_converter_cached (in_enc, &uc_err); + conv = rspamd_mime_get_converter_cached (in_enc, pool, TRUE, &uc_err); utf8_converter = rspamd_get_utf8_converter (); if (conv == NULL) { @@ -370,7 +385,8 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task, UConverter *utf8_converter; struct rspamd_charset_converter *conv; - conv = 
rspamd_mime_get_converter_cached (charset, &uc_err); + conv = rspamd_mime_get_converter_cached (charset, task->task_pool, + TRUE, &uc_err); utf8_converter = rspamd_get_utf8_converter (); if (conv == NULL) { @@ -429,6 +445,7 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task, gboolean rspamd_mime_to_utf8_byte_array (GByteArray *in, GByteArray *out, + rspamd_mempool_t *pool, const gchar *enc) { gint32 r, clen, dlen; @@ -438,6 +455,24 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in, struct rspamd_charset_converter *conv; rspamd_ftok_t charset_tok; + if (in == NULL || in->len == 0) { + return FALSE; + } + + if (enc == NULL) { + /* Assume utf ? */ + if (rspamd_fast_utf8_validate (in->data, in->len) == 0) { + g_byte_array_set_size (out, in->len); + memcpy (out->data, in->data, out->len); + + return TRUE; + } + else { + /* Bad stuff, keep out */ + return FALSE; + } + } + RSPAMD_FTOK_FROM_STR (&charset_tok, enc); if (rspamd_mime_charset_utf_check (&charset_tok, (gchar *)in->data, in->len, @@ -449,7 +484,7 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in, } utf8_converter = rspamd_get_utf8_converter (); - conv = rspamd_mime_get_converter_cached (enc, &uc_err); + conv = rspamd_mime_get_converter_cached (enc, pool, TRUE, &uc_err); if (conv == NULL) { return FALSE; diff --git a/src/libmime/mime_encoding.h b/src/libmime/mime_encoding.h index 5224d33fb..22f0ee818 100644 --- a/src/libmime/mime_encoding.h +++ b/src/libmime/mime_encoding.h @@ -47,7 +47,7 @@ const gchar *rspamd_mime_detect_charset (const rspamd_ftok_t *in, * @param pool * @param input * @param len - * @param in_enc + * @param in_enc canon charset * @param olen * @param err * @return @@ -57,14 +57,20 @@ gchar *rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool, gsize *olen, GError **err); /** - * Converts data from `in` to `out`, returns `FALSE` if `enc` is not a valid iconv charset + * Converts data from `in` to `out`, + * returns `FALSE` if `enc` is not a valid iconv charset + * + * This function, in 
fact, copies `in` to `out` replacing out content in + total. * @param in * @param out - * @param enc + * @param enc validated canonical charset name. If NULL, then utf8 check is done only * @return */ gboolean rspamd_mime_to_utf8_byte_array (GByteArray *in, - GByteArray *out, const gchar *enc); + GByteArray *out, + rspamd_mempool_t *pool, + const gchar *enc); /** * Maybe convert part to utf-8 @@ -83,7 +89,8 @@ void rspamd_mime_text_part_maybe_convert (struct rspamd_task *task, * @return */ gboolean rspamd_mime_charset_utf_check (rspamd_ftok_t *charset, - gchar *in, gsize len, gboolean content_check); + gchar *in, gsize len, + gboolean content_check); /** * Ensure that all characters in string are valid utf8 chars or replace them @@ -93,14 +100,18 @@ gboolean rspamd_mime_charset_utf_check (rspamd_ftok_t *charset, */ void rspamd_mime_charset_utf_enforce (gchar *in, gsize len); -/** - * Gets cached converter - * @param enc - * @param err - * @return - */ + /** + * Gets cached converter + * @param enc input encoding + * @param pool pool to use for temporary normalisation + * @param is_canon TRUE if `enc` is already canonical (no normalisation is needed) + * @param err output error + * @return converter + */ struct rspamd_charset_converter *rspamd_mime_get_converter_cached ( const gchar *enc, + rspamd_mempool_t *pool, + gboolean is_canon, UErrorCode *err); /** diff --git a/src/libmime/mime_headers.c b/src/libmime/mime_headers.c index b024bd7b1..e0c91c478 100644 --- a/src/libmime/mime_headers.c +++ b/src/libmime/mime_headers.c @@ -512,9 +512,12 @@ rspamd_mime_headers_process (struct rspamd_task *task, } static void -rspamd_mime_header_maybe_save_token (rspamd_mempool_t *pool, GString *out, - GByteArray *token, GByteArray *decoded_token, - rspamd_ftok_t *old_charset, rspamd_ftok_t *new_charset) +rspamd_mime_header_maybe_save_token (rspamd_mempool_t *pool, + GString *out, + GByteArray *token, + GByteArray *decoded_token, + rspamd_ftok_t *old_charset, + rspamd_ftok_t *new_charset) { if 
(new_charset->len == 0) { g_assert_not_reached (); @@ -538,14 +541,22 @@ rspamd_mime_header_maybe_save_token (rspamd_mempool_t *pool, GString *out, } /* We need to flush and decode old token to out string */ - if (rspamd_mime_to_utf8_byte_array (token, decoded_token, + if (rspamd_mime_to_utf8_byte_array (token, decoded_token, pool, rspamd_mime_detect_charset (new_charset, pool))) { g_string_append_len (out, decoded_token->data, decoded_token->len); } /* We also reset buffer */ g_byte_array_set_size (token, 0); - /* Propagate charset */ + /* + * Propagate charset + * + * Here are dragons: we save the original charset to allow buffers concat + * in the condition at the beginning of the function. + * However, it will likely cause unnecessary calls for + * `rspamd_mime_detect_charset` which could be relatively expensive. + * But we ignore that for now... + */ memcpy (old_charset, new_charset, sizeof (*old_charset)); } |