From ec5260928f2660093c7ac0a366aa1c12221ed737 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Thu, 14 Jul 2016 17:13:48 +0100 Subject: [PATCH] [Feature] Add a trivial heuristic for codepages --- src/libmime/message.c | 45 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/src/libmime/message.c b/src/libmime/message.c index 5f31cba59..4d955c8e9 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -327,10 +327,12 @@ charset_validate (rspamd_mempool_t *pool, const gchar *in, gchar **out) begin ++; changed = TRUE; } - if (!g_ascii_islower(*begin)) { + + if (g_ascii_islower (*begin)) { changed = TRUE; to_uppercase = TRUE; } + end = begin + strlen (begin) - 1; while (!g_ascii_isalnum (*end)) { end --; @@ -363,6 +365,34 @@ charset_validate (rspamd_mempool_t *pool, const gchar *in, gchar **out) return TRUE; } +static const gchar * +charset_heuristic_detection (const gchar *in, rspamd_mempool_t *pool) +{ + gchar *ret = NULL, *h, *t; + + if (strchr (in, '-') != NULL) { + /* Try to remove '-' chars from encoding: e.g. CP-100 to CP100 */ + ret = rspamd_mempool_strdup (pool, in); + + h = ret; + t = ret; + + while (*h != '\0') { + if (*h != '-') { + *t++ = *h; + } + + h ++; + } + + *t = '\0'; + + return ret; + } + + return in; +} + static GQuark converter_error_quark (void) { @@ -383,9 +413,16 @@ rspamd_text_to_utf8 (struct rspamd_task *task, ic = iconv_open (UTF8_CHARSET, in_enc); if (ic == (iconv_t)-1) { - g_set_error (err, converter_error_quark(), EINVAL, - "cannot open iconv for: %s", in_enc); - return NULL; + in_enc = charset_heuristic_detection (in_enc, task->task_pool); + + ic = iconv_open (UTF8_CHARSET, in_enc); + + if (ic == (iconv_t)-1) { + g_set_error (err, converter_error_quark(), EINVAL, + "cannot open iconv for: %s", in_enc); + + return NULL; + } } /* Preallocate for half of characters to be converted */ -- 2.39.5