]> source.dussan.org Git - rspamd.git/commitdiff
[Feature] Add method that detects 8 bit characters in text parts
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 24 Apr 2017 15:52:59 +0000 (16:52 +0100)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 1 May 2017 13:03:04 +0000 (14:03 +0100)
src/libmime/message.h
src/libmime/mime_encoding.c

index 22f4fd24dd4e90a9c3598a9f678198119ad20e76..a6a6f05952b2c4643691603d543b5f51db66f62c 100644 (file)
@@ -69,6 +69,8 @@ struct rspamd_mime_part {
 #define RSPAMD_MIME_TEXT_PART_FLAG_BALANCED (1 << 1)
 #define RSPAMD_MIME_TEXT_PART_FLAG_EMPTY (1 << 2)
 #define RSPAMD_MIME_TEXT_PART_FLAG_HTML (1 << 3)
+#define RSPAMD_MIME_TEXT_PART_FLAG_8BIT (1 << 4) /* set when the part's raw content contains a byte > 127 */
+#define RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED (1 << 5) /* set when the part's parsed content contains a byte > 127 */
 
 #define IS_PART_EMPTY(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_EMPTY)
 #define IS_PART_UTF(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_UTF)
index fdcd19c7bc2066667159cb5256aa0e56c6651b3d..11f764e11771a9468bd9b8600b1358e5487a65a6 100644 (file)
@@ -455,6 +455,36 @@ rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
        return FALSE;
 }
 
+/* https://graphics.stanford.edu/~seander/bithacks.html#HasMoreInWord */
+#define hasmore(x,n) (((x)+~0UL/255*(127-(n))|(x))&~0UL/255*128)
+
+/**
+ * Checks whether a buffer contains any byte with the high bit set (> 127).
+ *
+ * The bulk of the buffer is scanned one `unsigned long` at a time. Each word
+ * is loaded with memcpy so that an arbitrarily aligned `beg` never results in
+ * a misaligned or type-punned read. The trailing `len % sizeof (word)` bytes
+ * are checked one by one.
+ *
+ * @param beg start of the buffer; not dereferenced when len is 0
+ * @param len number of bytes to examine
+ * @return TRUE if at least one byte is greater than 127, FALSE otherwise
+ */
+static inline gboolean
+rspamd_mime_has_8bit (const guchar *beg, gsize len)
+{
+       unsigned long word;
+       const guchar *p = beg;
+       gsize nwords = len / sizeof (word), leftover = len % sizeof (word), i;
+
+       for (i = 0; i < nwords; i ++) {
+               /* memcpy instead of a pointer cast: beg may be unaligned */
+               memcpy (&word, p, sizeof (word));
+
+               /* With n == 127 hasmore() tests the top bit of every byte */
+               if (hasmore (word, 127)) {
+                       return TRUE;
+               }
+
+               p += sizeof (word);
+       }
+
+       /* Tail shorter than one word */
+       for (i = 0; i < leftover; i ++) {
+               if (p[i] > 127) {
+                       return TRUE;
+               }
+       }
+
+       return FALSE;
+}
+
 GByteArray *
 rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
                struct rspamd_mime_text_part *text_part)
@@ -468,12 +498,27 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
        rspamd_ftok_t charset_tok;
        struct rspamd_mime_part *part = text_part->mime_part;
 
+       if (rspamd_mime_has_8bit (text_part->raw.begin, text_part->raw.len)) {
+               text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT;
+       }
+
        part_content = rspamd_mempool_alloc0 (task->task_pool, sizeof (GByteArray));
        part_content->data = rspamd_mempool_alloc (task->task_pool,
                        text_part->parsed.len);
        memcpy (part_content->data, text_part->parsed.begin, text_part->parsed.len);
        part_content->len = text_part->parsed.len;
 
+       if (rspamd_mime_has_8bit (text_part->parsed.begin, text_part->parsed.len)) {
+               text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED;
+       }
+
+       if (!(text_part->flags & RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED)) {
+               /* We don't care anymore about encoding */
+               SET_PART_UTF (text_part);
+
+               return part_content;
+       }
+
        if (task->cfg && task->cfg->raw_mode) {
                SET_PART_RAW (text_part);
                return part_content;