aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2016-12-19 12:11:04 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2016-12-19 12:11:04 +0000
commit 60fdd0ae9bcca8e10a1e99dee3d0441ef4d1df19 (patch)
tree fa25808d14e5efd4bb2d567dc052abb78b0019cb
parent efb04bc038792671dced3b15cc1dca5d3f94e2cc (diff)
download rspamd-60fdd0ae9bcca8e10a1e99dee3d0441ef4d1df19.tar.gz
rspamd-60fdd0ae9bcca8e10a1e99dee3d0441ef4d1df19.zip
[Feature] Implement new unicode normalizer
Previously, Rspamd gave up when it found invalid UTF-8 characters. Now it replaces them with '?' and continues processing, which is more reasonable when processing emails.
-rw-r--r-- src/libmime/mime_encoding.c 69
1 files changed, 51 insertions, 18 deletions
diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c
index ca916c015..4332ab379 100644
--- a/src/libmime/mime_encoding.c
+++ b/src/libmime/mime_encoding.c
@@ -200,6 +200,49 @@ rspamd_text_to_utf8 (rspamd_mempool_t *pool,
return dst->str;
}
+/**
+ * Check whether a declared charset is one of the names this file treats as
+ * UTF-8 compatible and, if it is, sanitise the text buffer in place.
+ *
+ * The charset name is matched against a lazily compiled regexp (created on
+ * first use with the "i" flag and cached in `utf_compatible_re`). When the
+ * name matches, every byte sequence rejected by g_utf8_validate() is
+ * overwritten with '?' characters, so the buffer passes validation on return.
+ *
+ * @param charset charset name, passed to the matcher as a begin/len pair
+ * @param in text buffer; modified in place when the charset matches
+ * @param len number of bytes in `in`
+ * @return TRUE if the charset matched (the buffer has been sanitised),
+ * FALSE otherwise (the buffer is left untouched)
+ */
+gboolean
+rspamd_mime_charset_utf_check (rspamd_ftok_t *charset, gchar *in, gsize len)
+{
+	const gchar *bad, *p;
+	gchar *limit, *start, *next;
+	gsize remain = len;
+
+	if (utf_compatible_re == NULL) {
+		utf_compatible_re = rspamd_regexp_new (
+				"^(?:utf-?8.*)|(?:us-ascii)|(?:ascii)|(?:ansi)|(?:us)|(?:ISO-8859-1)|"
+				"(?:latin.*)|(?:CSASCII)$",
+				"i", NULL);
+	}
+
+	if (!rspamd_regexp_match (utf_compatible_re, charset->begin, charset->len,
+			TRUE)) {
+		return FALSE;
+	}
+
+	/* Now we validate input and replace bad characters with '?' symbol */
+	p = in;
+	limit = in + len;
+
+	while (remain > 0 && !g_utf8_validate (p, remain, &bad)) {
+		/*
+		 * `bad` is the first byte the validator rejected; it always lies
+		 * inside [p, limit). Re-derive a writable pointer from `in` instead
+		 * of casting the const away.
+		 */
+		start = in + (bad - in);
+
+		/*
+		 * Step over the offending byte and any continuation bytes
+		 * (10xxxxxx) that follow it. This is done by hand rather than with
+		 * g_utf8_find_next_char(): older GLib versions do not advance when
+		 * the byte is NUL, which previously aborted the loop and left the
+		 * rest of the buffer unsanitised.
+		 */
+		next = start + 1;
+
+		while (next < limit && (((unsigned char)*next) & 0xc0) == 0x80) {
+			next ++;
+		}
+
+		memset (start, '?', next - start);
+		p = next;
+		remain = limit - next;
+	}
+
+	return TRUE;
+}
+
+
GByteArray *
rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
struct rspamd_mime_text_part *text_part)
@@ -209,6 +252,7 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
const gchar *charset;
gchar *res_str;
GByteArray *result_array, *part_content;
+ rspamd_ftok_t charset_tok;
struct rspamd_mime_part *part = text_part->mime_part;
part_content = rspamd_mempool_alloc0 (task->task_pool, sizeof (GByteArray));
@@ -220,13 +264,6 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
return part_content;
}
- if (utf_compatible_re == NULL) {
- utf_compatible_re = rspamd_regexp_new (
- "^(?:utf-?8.*)|(?:us-ascii)|(?:ascii)|(?:ansi)|(?:us)|(?:ISO-8859-1)|"
- "(?:latin.*)|(?:CSASCII)$",
- "i", NULL);
- }
-
if (part->ct->charset.len == 0) {
SET_PART_RAW (text_part);
return part_content;
@@ -241,17 +278,13 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
return part_content;
}
- if (rspamd_regexp_match (utf_compatible_re, charset, strlen (charset), TRUE)) {
- if (g_utf8_validate (part_content->data, part_content->len, NULL)) {
- SET_PART_UTF (text_part);
- return part_content;
- }
- else {
- msg_info_task ("<%s>: contains invalid utf8 characters, assume it as raw",
- task->message_id);
- SET_PART_RAW (text_part);
- return part_content;
- }
+ RSPAMD_FTOK_FROM_STR (&charset_tok, charset);
+
+ if (rspamd_mime_charset_utf_check (&charset_tok, part_content->data,
+ part_content->len)) {
+ SET_PART_UTF (text_part);
+
+ return part_content;
}
else {
res_str = rspamd_text_to_utf8 (task->task_pool, part_content->data,