From ec439ea252e82a6500bbc589d09599ce8db46fdf Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Fri, 28 Dec 2018 07:53:42 +0000 Subject: [PATCH] [Feature] Core: Detect charset in archived files --- src/libmime/archives.c | 102 ++++++++++++++++++++++++++++++++---- src/libmime/mime_encoding.c | 10 ++-- src/libmime/mime_encoding.h | 38 ++++++++++++++ 3 files changed, 134 insertions(+), 16 deletions(-) diff --git a/src/libmime/archives.c b/src/libmime/archives.c index 183232e6f..8497fdf70 100644 --- a/src/libmime/archives.c +++ b/src/libmime/archives.c @@ -18,9 +18,12 @@ #include "message.h" #include "task.h" #include "archives.h" +#include "libmime/mime_encoding.h" #include #include #include +#include + static void rspamd_archive_dtor (gpointer p) @@ -42,6 +45,79 @@ rspamd_archive_dtor (gpointer p) g_ptr_array_free (arch->files, TRUE); } +static GString * +rspamd_archive_file_try_utf (const gchar *in, gsize inlen) +{ + const gchar *charset = NULL, *p, *end; + GString *res; + + charset = rspamd_mime_charset_find_by_content (in, inlen); + + if (charset) { + UChar *tmp; + UErrorCode uc_err = U_ZERO_ERROR; + gint32 r, clen, dlen; + struct rspamd_charset_converter *conv; + UConverter *utf8_converter; + + conv = rspamd_mime_get_converter_cached (charset, &uc_err); + utf8_converter = rspamd_get_utf8_converter (); + + if (conv == NULL) { + msg_err ("cannot open converter for %s: %s", + charset, u_errorName (uc_err)); + + return NULL; + } + + tmp = g_malloc (sizeof (*tmp) * (inlen + 1)); + r = rspamd_converter_to_uchars (conv, tmp, inlen + 1, + in, inlen, &uc_err); + if (!U_SUCCESS (uc_err)) { + msg_err ("cannot convert data to unicode from %s: %s", + charset, u_errorName (uc_err)); + g_free (tmp); + + return NULL; + } + + clen = ucnv_getMaxCharSize (utf8_converter); + dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen); + res = g_string_sized_new (dlen); + r = ucnv_fromUChars (utf8_converter, res->str, dlen, tmp, r, &uc_err); + + if (!U_SUCCESS (uc_err)) { + msg_err ("cannot convert data from unicode from %s: %s", + charset, u_errorName (uc_err)); + g_free (tmp); + g_string_free (res, TRUE); + + return NULL; + } + + res->len = r; + } + else { + /* Convert unsafe characters to '?' */ + res = g_string_sized_new (inlen); + p = in; + end = in + inlen; + + while (p < end) { + if (g_ascii_isgraph (*p)) { + g_string_append_c (res, *p); + } + else { + g_string_append_c (res, '?'); + } + + p ++; + } + } + + return res; +} + static void rspamd_archive_process_zip (struct rspamd_task *task, struct rspamd_mime_part *part) @@ -147,11 +223,17 @@ rspamd_archive_process_zip (struct rspamd_task *task, } f = g_malloc0 (sizeof (*f)); - f->fname = g_string_new_len (cd + cd_basic_len, fname_len); + f->fname = rspamd_archive_file_try_utf (cd + cd_basic_len, fname_len); f->compressed_size = comp_size; f->uncompressed_size = uncomp_size; - g_ptr_array_add (arch->files, f); - msg_debug_task ("found file in zip archive: %v", f->fname); + + if (f->fname) { + g_ptr_array_add (arch->files, f); + msg_debug_task ("found file in zip archive: %v", f->fname); + } + else { + g_free (f); + } cd += fname_len + comment_len + extra_len + cd_basic_len; } @@ -1227,7 +1309,10 @@ rspamd_7zip_ucs2_to_utf8 (struct rspamd_task *task, const guchar *p, while (src_pos < len) { U16_NEXT (up, src_pos, len, wc); - U8_APPEND (res->str, dest_pos, res->allocated_len, wc, is_error); + + if (wc > 0) { + U8_APPEND (res->str, dest_pos, res->allocated_len, wc, is_error); + } if (is_error) { g_string_free (res, TRUE); @@ -1519,19 +1604,14 @@ rspamd_archive_process_gzip (struct rspamd_task *task, struct rspamd_archive_file *f; f = g_malloc0 (sizeof (*f)); - f->fname = g_string_new (fname_start); + f->fname = rspamd_archive_file_try_utf (fname_start, + p - fname_start); g_ptr_array_add (arch->files, f); goto set; } } - else if (!g_ascii_isgraph (*p)) { - msg_debug_task ("gzip archive is invalid, bad filename at pos %d", - (int)(p - start)); - - return; - } p ++; } diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c index d7ac5d416..213817747 100644 --- a/src/libmime/mime_encoding.c +++ b/src/libmime/mime_encoding.c @@ -98,7 +98,7 @@ rspamd_converter_dtor (gpointer p) g_free (c); } -static int32_t +int32_t rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv, UChar *dest, int32_t destCapacity, @@ -132,7 +132,7 @@ rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv, } -static struct rspamd_charset_converter * +struct rspamd_charset_converter * rspamd_mime_get_converter_cached (const gchar *enc, UErrorCode *err) { const gchar *canon_name; @@ -497,8 +497,8 @@ rspamd_mime_charset_utf_enforce (gchar *in, gsize len) } } -static const char * -rspamd_mime_charset_find_by_content (gchar *in, gsize inlen) +const char * +rspamd_mime_charset_find_by_content (const gchar *in, gsize inlen) { static UCharsetDetector *csd; const UCharsetMatch **csm, *sel = NULL; @@ -524,7 +524,7 @@ rspamd_mime_charset_find_by_content (gchar *in, gsize inlen) detect: ucsdet_setText (csd, in, inlen, &uc_err); - csm = ucsdet_detectAll(csd, &matches, &uc_err); + csm = ucsdet_detectAll (csd, &matches, &uc_err); for (i = 0; i < matches; i ++) { if ((conf = ucsdet_getConfidence (csm[i], &uc_err)) > max_conf) { diff --git a/src/libmime/mime_encoding.h b/src/libmime/mime_encoding.h index 5f436d99d..1a61339ca 100644 --- a/src/libmime/mime_encoding.h +++ b/src/libmime/mime_encoding.h @@ -19,10 +19,12 @@ #include "config.h" #include "mem_pool.h" #include "fstring.h" +#include struct rspamd_task; struct rspamd_mime_part; struct rspamd_mime_text_part; +struct rspamd_charset_converter; /** * Convert charset to a valid iconv charset @@ -87,5 +89,41 @@ gboolean rspamd_mime_charset_utf_check (rspamd_ftok_t *charset, */ void rspamd_mime_charset_utf_enforce (gchar *in, gsize len); +/** + * Gets cached converter + * @param enc + * @param err + * @return + */ +struct rspamd_charset_converter *rspamd_mime_get_converter_cached ( + const gchar *enc, + UErrorCode *err); + +/** + * Performs charset->utf16 conversion + * @param cnv + * @param dest + * @param destCapacity + * @param src + * @param srcLength + * @param pErrorCode + * @return + */ +gint32 +rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv, + UChar *dest, + gint32 destCapacity, + const char *src, + gint32 srcLength, + UErrorCode *pErrorCode); + +/** + * Detect charset in text + * @param in + * @param inlen + * @return detected charset name or NULL + */ +const char *rspamd_mime_charset_find_by_content (const gchar *in, gsize inlen); + #endif /* SRC_LIBMIME_MIME_ENCODING_H_ */ -- 2.39.5