From ac7140f3be4c97d27f3da58500be1be1594b442c Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Mon, 24 Apr 2017 16:52:59 +0100
Subject: [PATCH] [Feature] Add method that detects 8 bit characters in text parts

---
 src/libmime/message.h       |  2 ++
 src/libmime/mime_encoding.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+)

diff --git a/src/libmime/message.h b/src/libmime/message.h
index 22f4fd24d..a6a6f0595 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -69,6 +69,8 @@ struct rspamd_mime_part {
 #define RSPAMD_MIME_TEXT_PART_FLAG_BALANCED (1 << 1)
 #define RSPAMD_MIME_TEXT_PART_FLAG_EMPTY (1 << 2)
 #define RSPAMD_MIME_TEXT_PART_FLAG_HTML (1 << 3)
+#define RSPAMD_MIME_TEXT_PART_FLAG_8BIT (1 << 4)
+#define RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED (1 << 5)

 #define IS_PART_EMPTY(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_EMPTY)
 #define IS_PART_UTF(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_UTF)
diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c
index fdcd19c7b..11f764e11 100644
--- a/src/libmime/mime_encoding.c
+++ b/src/libmime/mime_encoding.c
@@ -455,6 +455,36 @@ rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
 	return FALSE;
 }

+/* https://graphics.stanford.edu/~seander/bithacks.html#HasMoreInWord */
+#define hasmore(x,n) (((x)+~0UL/255*(127-(n))|(x))&~0UL/255*128)
+
+/*
+ * Returns TRUE if any of the `len` bytes at `beg` is above 127. Words are
+ * loaded with memcpy: `beg` may be unaligned and must not be read as a word.
+ */
+static inline gboolean
+rspamd_mime_has_8bit (const guchar *beg, gsize len)
+{
+	unsigned long w;
+	gsize i, leftover = len % sizeof (w);
+
+	for (i = 0; i < len / sizeof (w); i ++) {
+		memcpy (&w, beg, sizeof (w));
+		if (hasmore (w, 127)) {
+			return TRUE;
+		}
+		beg += sizeof (w);
+	}
+
+	for (i = 0; i < leftover; i ++) {
+		if (beg[i] > 127) {
+			return TRUE;
+		}
+	}
+
+	return FALSE;
+}
+
 GByteArray *
 rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
 		struct rspamd_mime_text_part *text_part)
@@ -468,12 +498,27 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
 	rspamd_ftok_t charset_tok;
 	struct rspamd_mime_part *part = text_part->mime_part;

+	if (rspamd_mime_has_8bit (text_part->raw.begin, text_part->raw.len)) {
+		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT;
+	}
+
 	part_content = rspamd_mempool_alloc0 (task->task_pool, sizeof (GByteArray));
 	part_content->data = rspamd_mempool_alloc (task->task_pool,
 			text_part->parsed.len);
 	memcpy (part_content->data, text_part->parsed.begin, text_part->parsed.len);
 	part_content->len = text_part->parsed.len;

+	if (rspamd_mime_has_8bit (text_part->parsed.begin, text_part->parsed.len)) {
+		text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED;
+	}
+
+	if (!(text_part->flags & RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED)) {
+		/* We don't care anymore about encoding */
+		SET_PART_UTF (text_part);
+
+		return part_content;
+	}
+
 	if (task->cfg && task->cfg->raw_mode) {
 		SET_PART_RAW (text_part);
 		return part_content;
-- 
2.39.5