aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2017-02-23 14:59:18 +0000
committerVsevolod Stakhov <vsevolod@highsecure.ru>2017-02-23 15:04:10 +0000
commitacb2a7bd0d9bcc899acd1bd259b0d4e79a47b457 (patch)
tree8d481669c43344f88580c4497842b0fe899b591b /src
parentc77c0e3f7b343202e7e6c4811f176be2fe744555 (diff)
downloadrspamd-acb2a7bd0d9bcc899acd1bd259b0d4e79a47b457.tar.gz
rspamd-acb2a7bd0d9bcc899acd1bd259b0d4e79a47b457.zip
[Feature] Add charset detection for text parts
Diffstat (limited to 'src')
-rw-r--r--src/libmime/mime_encoding.c58
1 files changed, 52 insertions, 6 deletions
diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c
index c90bb488d..4d5f71be1 100644
--- a/src/libmime/mime_encoding.c
+++ b/src/libmime/mime_encoding.c
@@ -22,6 +22,7 @@
#include "mime_encoding.h"
#include "message.h"
#include <unicode/ucnv.h>
+#include <unicode/ucsdet.h>
#define UTF8_CHARSET "UTF-8"
@@ -29,6 +30,7 @@
#define RSPAMD_CHARSET_FLAG_ASCII (1 << 1)
#define RSPAMD_CHARSET_CACHE_SIZE 32
+#define RSPAMD_CHARSET_MAX_CONTENT 512
#define SET_PART_RAW(part) ((part)->flags &= ~RSPAMD_MIME_TEXT_PART_FLAG_UTF)
#define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_TEXT_PART_FLAG_UTF)
@@ -139,6 +141,7 @@ rspamd_mime_detect_charset (const rspamd_ftok_t *in, rspamd_mempool_t *pool)
{
gchar *ret = NULL, *h, *t;
struct rspamd_charset_substitution *s;
+ UErrorCode uc_err = U_ZERO_ERROR;
if (sub_hash == NULL) {
rspamd_mime_encoding_substitute_init ();
@@ -167,10 +170,10 @@ rspamd_mime_detect_charset (const rspamd_ftok_t *in, rspamd_mempool_t *pool)
s = g_hash_table_lookup (sub_hash, ret);
if (s) {
- return s->canon;
+ return ucnv_getStandardName (s->canon, "IANA", &uc_err);
}
- return ret;
+ return ucnv_getStandardName (ret, "IANA", &uc_err);
}
gchar *
@@ -375,6 +378,37 @@ rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
return FALSE;
}
+static const char *
+rspamd_mime_charset_find_by_content (gchar *in, gsize inlen)
+{
+ static UCharsetDetector *csd;
+ const UCharsetMatch **csm, *sel = NULL;
+ UErrorCode uc_err = U_ZERO_ERROR;
+ gint32 matches, i, max_conf = G_MININT32, conf;
+
+ if (csd == NULL) {
+ csd = ucsdet_open (&uc_err);
+
+ g_assert (csd != NULL);
+ }
+
+ ucsdet_setText (csd, in, inlen, &uc_err);
+ csm = ucsdet_detectAll(csd, &matches, &uc_err);
+
+ for (i = 0; i < matches; i ++) {
+ if ((conf = ucsdet_getConfidence (csm[i], &uc_err)) > max_conf) {
+ max_conf = conf;
+ sel = csm[i];
+ }
+ }
+
+ if (sel && max_conf > 50) {
+ return ucsdet_getName (sel, &uc_err);
+ }
+
+ return NULL;
+}
+
GByteArray *
rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
struct rspamd_mime_text_part *text_part)
@@ -397,11 +431,23 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
}
if (part->ct->charset.len == 0) {
- SET_PART_RAW (text_part);
- return part_content;
+ charset = rspamd_mime_charset_find_by_content (part_content->data,
+ MIN (RSPAMD_CHARSET_MAX_CONTENT, part_content->len));
+
+ if (charset != NULL) {
+ msg_info_task ("detected charset %s", charset);
+ }
}
+ else {
+ charset = rspamd_mime_detect_charset (&part->ct->charset,
+ task->task_pool);
- charset = rspamd_mime_detect_charset (&part->ct->charset, task->task_pool);
+ if (charset == NULL) {
+ charset = rspamd_mime_charset_find_by_content (part_content->data,
+ MIN (RSPAMD_CHARSET_MAX_CONTENT, part_content->len));
+ msg_info_task ("detected charset: %s", charset);
+ }
+ }
if (charset == NULL) {
msg_info_task ("<%s>: has invalid charset", task->message_id);
@@ -433,7 +479,7 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
SET_PART_RAW (text_part);
g_error_free (err);
- return part_content;
+ return NULL;
}
}