source.dussan.org Git - rspamd.git/commitdiff
[Rework] No more magic
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 9 Sep 2019 15:26:16 +0000 (16:26 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 9 Sep 2019 15:26:16 +0000 (16:26 +0100)
CMakeLists.txt
src/libmime/message.c
src/libmime/message.h
src/libserver/cfg_file.h
src/libserver/cfg_rcl.c
src/libutil/util.c
src/rspamd.h

index 22c4b817b29fd7bce8166084f75a56c4c21b1306..9522143911da76743e946735324c12699507bb98 100644 (file)
@@ -638,8 +638,6 @@ ProcessPackage(LIBCRYPT LIBRARY crypto INCLUDE openssl/evp.h
        ROOT ${OPENSSL_ROOT_DIR} MODULES openssl libcrypt)
 ProcessPackage(LIBSSL LIBRARY ssl INCLUDE openssl/ssl.h
        ROOT ${OPENSSL_ROOT_DIR} MODULES openssl libssl)
-ProcessPackage(MAGIC LIBRARY magic INCLUDE magic.h INCLUDE_SUFFIXES include/libmagic
-       ROOT ${LIBMAGIC_ROOT_DIR} MODULES magic)
 ProcessPackage(LIBZ LIBRARY z INCLUDE zlib.h INCLUDE_SUFFIXES include/zlib
                ROOT ${LIBZ_ROOT_DIR} MODULES z)
 ProcessPackage(SODIUM LIBRARY sodium INCLUDE sodium.h
index 92fa1f51bf954e624a2772decb43261b10239689..00067ee83634d190788fc4f4f880f1d9724571a2 100644 (file)
@@ -818,98 +818,19 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task,
 
        if (IS_CT_TEXT (mime_part->ct) && (!mime_part->detected_ct ||
                                                                           IS_CT_TEXT (mime_part->detected_ct))) {
+               found_txt = TRUE;
+
                html_tok.begin = "html";
                html_tok.len = 4;
                xhtml_tok.begin = "xhtml";
                xhtml_tok.len = 5;
 
                if (rspamd_ftok_casecmp (&mime_part->ct->subtype, &html_tok) == 0 ||
-                               rspamd_ftok_casecmp (&mime_part->ct->subtype, &xhtml_tok) == 0) {
+                       rspamd_ftok_casecmp (&mime_part->ct->subtype, &xhtml_tok) == 0 ||
+                       (mime_part->detected_ct &&
+                               rspamd_ftok_casecmp (&mime_part->detected_ct->subtype, &html_tok) == 0)) {
                        found_html = TRUE;
                }
-               else {
-                       /*
-                        * We also need to apply heuristic for text parts that are actually
-                        * HTML.
-                        */
-                       RSPAMD_FTOK_ASSIGN (&html_tok, "<!DOCTYPE html");
-                       RSPAMD_FTOK_ASSIGN (&xhtml_tok, "<html");
-
-                       if (mime_part->parsed_data.len >= xhtml_tok.len &&
-                                       rspamd_lc_cmp (mime_part->parsed_data.begin,
-                                                       xhtml_tok.begin, xhtml_tok.len) == 0) {
-                               found_html = TRUE;
-                       }
-                       else if (mime_part->parsed_data.len >= html_tok.len &&
-                                       rspamd_lc_cmp (mime_part->parsed_data.begin,
-                                                       html_tok.begin, html_tok.len) == 0) {
-                               found_html = TRUE;
-                       }
-                       else {
-                               /* We need to be extra careful with some stupid things here */
-
-                               html_tok.begin = "plain";
-                               html_tok.len = 5;
-
-                               if (rspamd_ftok_casecmp (&mime_part->ct->subtype, &html_tok) == 0) {
-                                       found_txt = TRUE;
-                               }
-                               else {
-                                       if (mime_part->cd && mime_part->cd->filename.len > 4) {
-                                               const gchar *pos = mime_part->cd->filename.begin +
-                                                                                  mime_part->cd->filename.len -
-                                                                                  sizeof (".txt") + 1;
-                                               if (rspamd_lc_cmp (pos, ".txt", sizeof ("txt") - 1) == 0) {
-                                                       found_txt = TRUE;
-                                               }
-                                               else {
-                                                       msg_debug_task ("found mime part with incorrect content-type: %T/%T, "
-                                                                                  "filename: %T",
-                                                                       &mime_part->ct->type,
-                                                                       &mime_part->ct->subtype,
-                                                                       &mime_part->cd->filename);
-                                               }
-                                       }
-                                       else {
-                                               /* For something like Content-Type: text */
-                                               found_txt = TRUE;
-                                       }
-                               }
-                       }
-
-                       if (found_html) {
-                               msg_info_task ("found html part pretending to be text/plain part");
-                       }
-               }
-       }
-       else {
-               /* Apply heuristic */
-
-               if (mime_part->cd && mime_part->cd->filename.len > 4) {
-                       const gchar *pos = mime_part->cd->filename.begin +
-                                       mime_part->cd->filename.len - sizeof (".htm") + 1;
-
-                       if (rspamd_lc_cmp (pos, ".htm", sizeof (".htm") - 1) == 0) {
-                               found_html = TRUE;
-                       }
-                       else if (rspamd_lc_cmp (pos, ".txt", sizeof ("txt") - 1) == 0) {
-                               found_txt = TRUE;
-                       }
-                       else if ( mime_part->cd->filename.len > 5) {
-                               pos = mime_part->cd->filename.begin +
-                                               mime_part->cd->filename.len - sizeof (".html") + 1;
-                               if (rspamd_lc_cmp (pos, ".html", sizeof (".html") - 1) == 0) {
-                                       found_html = TRUE;
-                               }
-                       }
-               }
-
-               if (found_txt || found_html) {
-                       msg_info_task ("found %s part with incorrect content-type: %T/%T",
-                                       found_html ? "html" : "text",
-                                       &mime_part->ct->type, &mime_part->ct->subtype);
-                       mime_part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
-               }
        }
 
        /* Skip attachments */
@@ -1006,7 +927,7 @@ rspamd_message_from_data (struct rspamd_task *task, const guchar *start,
 {
        struct rspamd_content_type *ct = NULL;
        struct rspamd_mime_part *part;
-       const char *mb = NULL;
+       const char *mb = "application/octet-stream";
        gchar *mid;
        rspamd_ftok_t srch, *tok;
        gchar cdbuf[1024];
@@ -1015,6 +936,14 @@ rspamd_message_from_data (struct rspamd_task *task, const guchar *start,
 
        part = rspamd_mempool_alloc0 (task->task_pool, sizeof (*part));
 
+       part->raw_data.begin = start;
+       part->raw_data.len = len;
+       part->parsed_data.begin = start;
+       part->parsed_data.len = len;
+       part->id = MESSAGE_FIELD (task, parts)->len;
+       part->raw_headers = rspamd_message_headers_new ();
+       part->headers_order = NULL;
+
        tok = rspamd_task_get_request_header (task, "Content-Type");
 
        if (tok) {
@@ -1023,11 +952,42 @@ rspamd_message_from_data (struct rspamd_task *task, const guchar *start,
                                task->task_pool);
                part->ct = ct;
        }
+       else if (task->cfg && task->cfg->libs_ctx) {
+               lua_State *L = task->cfg->lua_state;
+
+               if (rspamd_lua_require_function (L,
+                               "lua_magic", "detect_mime_part")) {
+
+                       struct rspamd_mime_part **pmime;
+                       struct rspamd_task **ptask;
 
-       if (task->cfg && task->cfg->libs_ctx) {
-               mb = magic_buffer (task->cfg->libs_ctx->libmagic,
-                               start,
-                               len);
+                       pmime = lua_newuserdata (L, sizeof (struct rspamd_mime_part *));
+                       rspamd_lua_setclass (L, "rspamd{mimepart}", -1);
+                       *pmime = part;
+                       ptask = lua_newuserdata (L, sizeof (struct rspamd_task *));
+                       rspamd_lua_setclass (L, "rspamd{task}", -1);
+                       *ptask = task;
+
+                       if (lua_pcall (L, 2, 2, 0) != 0) {
+                               msg_err_task ("cannot detect type: %s", lua_tostring (L, -1));
+                       }
+                       else {
+                               if (lua_istable (L, -1)) {
+                                       lua_pushstring (L, "ct");
+                                       lua_gettable (L, -2);
+
+                                       if (lua_isstring (L, -1)) {
+                                               mb = rspamd_mempool_strdup (task->task_pool,
+                                                               lua_tostring (L, -1));
+                                       }
+                               }
+                       }
+
+                       lua_settop (L, 0);
+               }
+               else {
+                       msg_err_task ("cannot require lua_magic.detect_mime_part");
+               }
 
                if (mb) {
                        srch.begin = mb;
@@ -1059,13 +1019,6 @@ rspamd_message_from_data (struct rspamd_task *task, const guchar *start,
                }
        }
 
-       part->raw_data.begin = start;
-       part->raw_data.len = len;
-       part->parsed_data.begin = start;
-       part->parsed_data.len = len;
-       part->id = MESSAGE_FIELD (task, parts)->len;
-       part->raw_headers = rspamd_message_headers_new ();
-       part->headers_order = NULL;
 
        tok = rspamd_task_get_request_header (task, "Filename");
 
@@ -1408,31 +1361,81 @@ rspamd_message_process (struct rspamd_task *task)
        gdouble diff, *pdiff;
        guint tw, *ptw, dw;
        struct rspamd_mime_part *part;
+       lua_State *L = task->cfg->lua_state;
+       gint func_pos = -1;
 
        rspamd_images_process (task);
        rspamd_archives_process (task);
 
+       if (rspamd_lua_require_function (L,
+                       "lua_magic", "detect_mime_part")) {
+               func_pos = lua_gettop (L);
+       }
+       else {
+               msg_err_task ("cannot require lua_magic.detect_mime_part");
+       }
+
        PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, parts), i, part) {
-               if (!rspamd_message_process_text_part_maybe (task, part) &&
-                               part->parsed_data.len > 0) {
-                       if (task->cfg) {
-                               const gchar *mb = magic_buffer (task->cfg->libs_ctx->libmagic,
-                                               part->parsed_data.begin,
-                                               part->parsed_data.len);
-
-                               if (mb) {
-                                       rspamd_ftok_t srch;
-
-                                       srch.begin = mb;
-                                       srch.len = strlen (mb);
-                                       part->detected_ct = rspamd_content_type_parse (srch.begin,
-                                                       srch.len,
-                                                       task->task_pool);
+               if (func_pos != -1) {
+                       struct rspamd_mime_part **pmime;
+                       struct rspamd_task **ptask;
+
+                       lua_pushvalue (L, func_pos);
+                       pmime = lua_newuserdata (L, sizeof (struct rspamd_mime_part *));
+                       rspamd_lua_setclass (L, "rspamd{mimepart}", -1);
+                       *pmime = part;
+                       ptask = lua_newuserdata (L, sizeof (struct rspamd_task *));
+                       rspamd_lua_setclass (L, "rspamd{task}", -1);
+                       *ptask = task;
+
+                       if (lua_pcall (L, 2, 2, 0) != 0) {
+                               msg_err_task ("cannot detect type: %s", lua_tostring (L, -1));
+                       }
+                       else {
+                               if (lua_istable (L, -1)) {
+                                       const gchar *mb;
+
+                                       /* First returned value */
+                                       part->detected_ext = rspamd_mempool_strdup (task->task_pool,
+                                                       lua_tostring (L, -2));
+
+                                       lua_pushstring (L, "ct");
+                                       lua_gettable (L, -2);
+
+                                       if (lua_isstring (L, -1)) {
+                                               mb = lua_tostring (L, -1);
+
+                                               if (mb) {
+                                                       rspamd_ftok_t srch;
+
+                                                       srch.begin = mb;
+                                                       srch.len = strlen (mb);
+                                                       part->detected_ct = rspamd_content_type_parse (srch.begin,
+                                                                       srch.len,
+                                                                       task->task_pool);
+                                               }
+                                       }
+
+                                       lua_pop (L, 1);
+
+                                       lua_pushstring (L, "type");
+                                       lua_gettable (L, -2);
+
+                                       if (lua_isstring (L, -1)) {
+                                               part->detected_type = rspamd_mempool_strdup (task->task_pool,
+                                                               lua_tostring (L, -1));
+                                       }
                                }
                        }
+
+                       lua_settop (L, func_pos);
                }
+
+               rspamd_message_process_text_part_maybe (task, part);
        }
 
+       lua_settop (L, 0);
+
        /* Calculate average words length and number of short words */
        struct rspamd_mime_text_part *text_part;
        gdouble *var;
index 651e1d457bbf7dcb03ae3037dbf1c7a24760275a..374d3a7f98e5a519bbc490de02ee213188c5f193 100644 (file)
@@ -56,6 +56,8 @@ struct rspamd_mime_multipart {
 struct rspamd_mime_part {
        struct rspamd_content_type *ct;
        struct rspamd_content_type *detected_ct;
+       gchar *detected_type;
+       gchar *detected_ext;
        struct rspamd_content_disposition *cd;
        rspamd_ftok_t raw_data;
        rspamd_ftok_t parsed_data;
index 4faca7b56c5e269a9b5c6cb59413020756f769b4..263d00f38c0d5a60461e88c127c4c67ab970ceb2 100644 (file)
@@ -437,7 +437,6 @@ struct rspamd_config {
        gchar *history_file;                           /**< file to save rolling history                                                */
        gchar *tld_file;                               /**< file to load effective tld list from                                */
        gchar *hs_cache_dir;                           /**< directory to save hyperscan databases                               */
-       gchar *magic_file;                             /**< file to initialize libmagic                                         */
 
        gdouble dns_timeout;                            /**< timeout in milliseconds for waiting for dns reply  */
        guint32 dns_retransmits;                        /**< maximum retransmits count                                                  */
index 2bdb6adc6ee0c25a513f218a9dfba33b85a82462..fb2cbf0520a5877e68380d28f862441ca3935e21 100644 (file)
@@ -2092,12 +2092,6 @@ rspamd_rcl_config_init (struct rspamd_config *cfg, GHashTable *skip_sections)
                                G_STRUCT_OFFSET (struct rspamd_config, ssl_ciphers),
                                0,
                                "List of ssl ciphers (e.g. HIGH:!aNULL:!kRSA:!PSK:!SRP:!MD5:!RC4)");
-               rspamd_rcl_add_default_handler (sub,
-                               "magic_file",
-                               rspamd_rcl_parse_struct_string,
-                               G_STRUCT_OFFSET (struct rspamd_config, magic_file),
-                               0,
-                               "Path to a custom libmagic file");
                rspamd_rcl_add_default_handler (sub,
                                "max_message",
                                rspamd_rcl_parse_struct_integer,
index 86358e46eba090fcc3ef06c4f540c23ca051902b..7877582c27dd5222a51da4fd5fc3e0c9c3e71229 100644 (file)
@@ -2364,35 +2364,6 @@ rspamd_init_libs (void)
        rlim.rlim_max = rlim.rlim_cur;
        setrlimit (RLIMIT_STACK, &rlim);
 
-       gint magic_flags = 0;
-
-       /* Unless trusty and other crap is supported... */
-#if 0
-#ifdef MAGIC_NO_CHECK_BUILTIN
-       magic_flags = MAGIC_NO_CHECK_BUILTIN;
-#endif
-#endif
-       magic_flags |= MAGIC_MIME|MAGIC_NO_CHECK_COMPRESS|
-                                  MAGIC_NO_CHECK_ELF|MAGIC_NO_CHECK_TAR;
-#ifdef MAGIC_NO_CHECK_CDF
-       magic_flags |= MAGIC_NO_CHECK_CDF;
-#endif
-#ifdef MAGIC_NO_CHECK_ENCODING
-       magic_flags |= MAGIC_NO_CHECK_ENCODING;
-#endif
-#ifdef MAGIC_NO_CHECK_TAR
-       magic_flags |= MAGIC_NO_CHECK_TAR;
-#endif
-#ifdef MAGIC_NO_CHECK_TEXT
-       magic_flags |= MAGIC_NO_CHECK_TEXT;
-#endif
-#ifdef MAGIC_NO_CHECK_TOKENS
-       magic_flags |= MAGIC_NO_CHECK_TOKENS;
-#endif
-#ifdef MAGIC_NO_CHECK_JSON
-       magic_flags |= MAGIC_NO_CHECK_JSON;
-#endif
-       ctx->libmagic = magic_open (magic_flags);
        ctx->local_addrs = rspamd_inet_library_init ();
        REF_INIT_RETAIN (ctx, rspamd_deinit_libs);
 
@@ -2473,10 +2444,6 @@ rspamd_config_libs (struct rspamd_external_libs_ctx *ctx,
                        }
                }
 
-               if (ctx->libmagic) {
-                       magic_load (ctx->libmagic, cfg->magic_file);
-               }
-
                rspamd_free_zstd_dictionary (ctx->in_dict);
                rspamd_free_zstd_dictionary (ctx->out_dict);
 
@@ -2586,10 +2553,6 @@ void
 rspamd_deinit_libs (struct rspamd_external_libs_ctx *ctx)
 {
        if (ctx != NULL) {
-               if (ctx->libmagic) {
-                       magic_close (ctx->libmagic);
-               }
-
                g_free (ctx->ottery_cfg);
 
 #ifdef HAVE_OPENSSL
index 0a0fb45fc80ab9fad5aad9679a834dd745c1c4fc..ea11965fb969052fed500505fb70d0061bcbe730 100644 (file)
@@ -33,8 +33,6 @@
 #include "libserver/task.h"
 
 #include <openssl/ssl.h>
-#include <magic.h>
-
 
 /* Default values */
 #define FIXED_CONFIG_FILE RSPAMD_CONFDIR "/rspamd.conf"
@@ -353,7 +351,6 @@ struct zstd_dictionary {
 struct rspamd_radix_map_helper;
 
 struct rspamd_external_libs_ctx {
-       magic_t libmagic;
        struct rspamd_radix_map_helper **local_addrs;
        struct rspamd_cryptobox_library_ctx *crypto_ctx;
        struct ottery_config *ottery_cfg;