From: Vsevolod Stakhov
Date: Mon, 19 Dec 2016 16:49:46 +0000 (+0000)
Subject: [Minor] Add routine to recode byte arrays to utf-8
X-Git-Tag: 1.5.0~551
X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=41f091a7b2aaae65272306ee5c356a39622563f5;p=rspamd.git

[Minor] Add routine to recode byte arrays to utf-8
---

diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c
index 4332ab379..47b490b73 100644
--- a/src/libmime/mime_encoding.c
+++ b/src/libmime/mime_encoding.c
@@ -132,7 +132,7 @@ rspamd_mime_detect_charset (const rspamd_ftok_t *in, rspamd_mempool_t *pool)
 }
 
 gchar *
-rspamd_text_to_utf8 (rspamd_mempool_t *pool,
+rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
 		gchar *input, gsize len, const gchar *in_enc,
 		gsize *olen, GError **err)
 {
@@ -200,6 +200,72 @@ rspamd_text_to_utf8 (rspamd_mempool_t *pool,
 	return dst->str;
 }
 
+gboolean
+rspamd_mime_to_utf8_byte_array (GByteArray *in,
+		GByteArray *out,
+		const gchar *enc)
+{
+	guchar *s, *d;
+	gsize outlen, pos;
+	iconv_t ic;
+	gsize remain, ret, inremain = in->len;
+	gint err;
+
+	ic = iconv_open (UTF8_CHARSET, enc);
+
+	if (ic == (iconv_t)-1) {
+		return FALSE;
+	}
+
+	/* Preallocate 1.5x of the input size, the buffer grows on demand */
+	outlen = inremain + inremain / 2 + 1;
+	g_byte_array_set_size (out, outlen);
+	s = in->data;
+	d = out->data;
+	remain = outlen;
+
+	while (inremain > 0) {
+		ret = iconv (ic, (gchar **)&s, &inremain, (gchar **)&d, &remain);
+
+		if (ret != (gsize)-1) {
+			/* No error means that the whole input has been consumed */
+			break;
+		}
+
+		/* Save errno before any other call can overwrite it */
+		err = errno;
+
+		if ((err == EILSEQ || err == EINVAL) && remain > 0) {
+			/* Replace a byte that cannot be converted and skip it */
+			*d++ = '?';
+			s++;
+			inremain --;
+			remain --;
+		}
+		else if (err == E2BIG || err == EILSEQ || err == EINVAL) {
+			/*
+			 * Out of room. Resizing may reallocate the data, so keep the
+			 * real write offset (not the old capacity) to resume from it
+			 */
+			pos = d - out->data;
+			outlen += inremain * 2;
+			g_byte_array_set_size (out, outlen);
+			d = out->data + pos;
+			remain = outlen - pos;
+		}
+		else {
+			/* Unexpected error, iconv cannot make any progress */
+			break;
+		}
+	}
+
+	/* Trim the array to the bytes that have been actually written */
+	g_byte_array_set_size (out, d - out->data);
+	iconv_close (ic);
+
+	return TRUE;
+}
+
 gboolean
 rspamd_mime_charset_utf_check (rspamd_ftok_t *charset, gchar *in, gsize len)
 {
@@ -287,7 +353,7 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
 		return part_content;
 	}
 	else {
-		res_str = rspamd_text_to_utf8 (task->task_pool, part_content->data,
+		res_str = rspamd_mime_text_to_utf8 (task->task_pool, part_content->data,
 				part_content->len,
 				charset,
 				&write_bytes,
diff --git a/src/libmime/mime_encoding.h b/src/libmime/mime_encoding.h
index 9c0975406..eb3a59942 100644
--- a/src/libmime/mime_encoding.h
+++ b/src/libmime/mime_encoding.h
@@ -49,6 +49,16 @@ gchar * rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
 		gchar *input, gsize len, const gchar *in_enc,
 		gsize *olen, GError **err);
 
+/**
+ * Recodes `in` from charset `enc` to utf-8 in `out`, bytes that cannot be converted become `?`
+ * @param in bytes to recode, interpreted in the charset `enc`
+ * @param out array that is resized and filled with the utf-8 output
+ * @param enc iconv name of the source charset
+ * @return `TRUE` if a converter for `enc` has been opened, `FALSE` otherwise
+ */
+gboolean rspamd_mime_to_utf8_byte_array (GByteArray *in,
+		GByteArray *out, const gchar *enc);
+
 /**
  * Maybe convert part to utf-8
  * @param task
@@ -58,5 +68,14 @@ gchar * rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
 GByteArray * rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
 		struct rspamd_mime_text_part *text_part);
 
+/**
+ * Checks utf8 charset and normalize/validate utf8 string
+ * @param charset
+ * @param in
+ * @param len
+ * @return
+ */
+gboolean rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
+		gchar *in, gsize len);
 
 #endif /* SRC_LIBMIME_MIME_ENCODING_H_ */