From: Vsevolod Stakhov Date: Mon, 9 Sep 2019 15:26:16 +0000 (+0100) Subject: [Rework] No more magic X-Git-Tag: 2.0~238 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=56e236efa012c4be6b3893314ce4d3a570e16327;p=rspamd.git [Rework] No more magic --- diff --git a/CMakeLists.txt b/CMakeLists.txt index 22c4b817b..952214391 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -638,8 +638,6 @@ ProcessPackage(LIBCRYPT LIBRARY crypto INCLUDE openssl/evp.h ROOT ${OPENSSL_ROOT_DIR} MODULES openssl libcrypt) ProcessPackage(LIBSSL LIBRARY ssl INCLUDE openssl/ssl.h ROOT ${OPENSSL_ROOT_DIR} MODULES openssl libssl) -ProcessPackage(MAGIC LIBRARY magic INCLUDE magic.h INCLUDE_SUFFIXES include/libmagic - ROOT ${LIBMAGIC_ROOT_DIR} MODULES magic) ProcessPackage(LIBZ LIBRARY z INCLUDE zlib.h INCLUDE_SUFFIXES include/zlib ROOT ${LIBZ_ROOT_DIR} MODULES z) ProcessPackage(SODIUM LIBRARY sodium INCLUDE sodium.h diff --git a/src/libmime/message.c b/src/libmime/message.c index 92fa1f51b..00067ee83 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -818,98 +818,19 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task, if (IS_CT_TEXT (mime_part->ct) && (!mime_part->detected_ct || IS_CT_TEXT (mime_part->detected_ct))) { + found_txt = TRUE; + html_tok.begin = "html"; html_tok.len = 4; xhtml_tok.begin = "xhtml"; xhtml_tok.len = 5; if (rspamd_ftok_casecmp (&mime_part->ct->subtype, &html_tok) == 0 || - rspamd_ftok_casecmp (&mime_part->ct->subtype, &xhtml_tok) == 0) { + rspamd_ftok_casecmp (&mime_part->ct->subtype, &xhtml_tok) == 0 || + (mime_part->detected_ct && + rspamd_ftok_casecmp (&mime_part->detected_ct->subtype, &html_tok) == 0)) { found_html = TRUE; } - else { - /* - * We also need to apply heuristic for text parts that are actually - * HTML. 
- */ - RSPAMD_FTOK_ASSIGN (&html_tok, "parsed_data.len >= xhtml_tok.len && - rspamd_lc_cmp (mime_part->parsed_data.begin, - xhtml_tok.begin, xhtml_tok.len) == 0) { - found_html = TRUE; - } - else if (mime_part->parsed_data.len >= html_tok.len && - rspamd_lc_cmp (mime_part->parsed_data.begin, - html_tok.begin, html_tok.len) == 0) { - found_html = TRUE; - } - else { - /* We need to be extra careful with some stupid things here */ - - html_tok.begin = "plain"; - html_tok.len = 5; - - if (rspamd_ftok_casecmp (&mime_part->ct->subtype, &html_tok) == 0) { - found_txt = TRUE; - } - else { - if (mime_part->cd && mime_part->cd->filename.len > 4) { - const gchar *pos = mime_part->cd->filename.begin + - mime_part->cd->filename.len - - sizeof (".txt") + 1; - if (rspamd_lc_cmp (pos, ".txt", sizeof ("txt") - 1) == 0) { - found_txt = TRUE; - } - else { - msg_debug_task ("found mime part with incorrect content-type: %T/%T, " - "filename: %T", - &mime_part->ct->type, - &mime_part->ct->subtype, - &mime_part->cd->filename); - } - } - else { - /* For something like Content-Type: text */ - found_txt = TRUE; - } - } - } - - if (found_html) { - msg_info_task ("found html part pretending to be text/plain part"); - } - } - } - else { - /* Apply heuristic */ - - if (mime_part->cd && mime_part->cd->filename.len > 4) { - const gchar *pos = mime_part->cd->filename.begin + - mime_part->cd->filename.len - sizeof (".htm") + 1; - - if (rspamd_lc_cmp (pos, ".htm", sizeof (".htm") - 1) == 0) { - found_html = TRUE; - } - else if (rspamd_lc_cmp (pos, ".txt", sizeof ("txt") - 1) == 0) { - found_txt = TRUE; - } - else if ( mime_part->cd->filename.len > 5) { - pos = mime_part->cd->filename.begin + - mime_part->cd->filename.len - sizeof (".html") + 1; - if (rspamd_lc_cmp (pos, ".html", sizeof (".html") - 1) == 0) { - found_html = TRUE; - } - } - } - - if (found_txt || found_html) { - msg_info_task ("found %s part with incorrect content-type: %T/%T", - found_html ? 
"html" : "text", - &mime_part->ct->type, &mime_part->ct->subtype); - mime_part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN; - } } /* Skip attachments */ @@ -1006,7 +927,7 @@ rspamd_message_from_data (struct rspamd_task *task, const guchar *start, { struct rspamd_content_type *ct = NULL; struct rspamd_mime_part *part; - const char *mb = NULL; + const char *mb = "application/octet-stream"; gchar *mid; rspamd_ftok_t srch, *tok; gchar cdbuf[1024]; @@ -1015,6 +936,14 @@ rspamd_message_from_data (struct rspamd_task *task, const guchar *start, part = rspamd_mempool_alloc0 (task->task_pool, sizeof (*part)); + part->raw_data.begin = start; + part->raw_data.len = len; + part->parsed_data.begin = start; + part->parsed_data.len = len; + part->id = MESSAGE_FIELD (task, parts)->len; + part->raw_headers = rspamd_message_headers_new (); + part->headers_order = NULL; + tok = rspamd_task_get_request_header (task, "Content-Type"); if (tok) { @@ -1023,11 +952,42 @@ rspamd_message_from_data (struct rspamd_task *task, const guchar *start, task->task_pool); part->ct = ct; } + else if (task->cfg && task->cfg->libs_ctx) { + lua_State *L = task->cfg->lua_state; + + if (rspamd_lua_require_function (L, + "lua_magic", "detect_mime_part")) { + + struct rspamd_mime_part **pmime; + struct rspamd_task **ptask; - if (task->cfg && task->cfg->libs_ctx) { - mb = magic_buffer (task->cfg->libs_ctx->libmagic, - start, - len); + pmime = lua_newuserdata (L, sizeof (struct rspamd_mime_part *)); + rspamd_lua_setclass (L, "rspamd{mimepart}", -1); + *pmime = part; + ptask = lua_newuserdata (L, sizeof (struct rspamd_task *)); + rspamd_lua_setclass (L, "rspamd{task}", -1); + *ptask = task; + + if (lua_pcall (L, 2, 2, 0) != 0) { + msg_err_task ("cannot detect type: %s", lua_tostring (L, -1)); + } + else { + if (lua_istable (L, -1)) { + lua_pushstring (L, "ct"); + lua_gettable (L, -2); + + if (lua_isstring (L, -1)) { + mb = rspamd_mempool_strdup (task->task_pool, + lua_tostring (L, -1)); + } + } + } + + lua_settop 
(L, 0); + } + else { + msg_err_task ("cannot require lua_magic.detect_mime_part"); + } if (mb) { srch.begin = mb; @@ -1059,13 +1019,6 @@ rspamd_message_from_data (struct rspamd_task *task, const guchar *start, } } - part->raw_data.begin = start; - part->raw_data.len = len; - part->parsed_data.begin = start; - part->parsed_data.len = len; - part->id = MESSAGE_FIELD (task, parts)->len; - part->raw_headers = rspamd_message_headers_new (); - part->headers_order = NULL; tok = rspamd_task_get_request_header (task, "Filename"); @@ -1408,31 +1361,81 @@ rspamd_message_process (struct rspamd_task *task) gdouble diff, *pdiff; guint tw, *ptw, dw; struct rspamd_mime_part *part; + lua_State *L = task->cfg->lua_state; + gint func_pos = -1; rspamd_images_process (task); rspamd_archives_process (task); + if (rspamd_lua_require_function (L, + "lua_magic", "detect_mime_part")) { + func_pos = lua_gettop (L); + } + else { + msg_err_task ("cannot require lua_magic.detect_mime_part"); + } + PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, parts), i, part) { - if (!rspamd_message_process_text_part_maybe (task, part) && - part->parsed_data.len > 0) { - if (task->cfg) { - const gchar *mb = magic_buffer (task->cfg->libs_ctx->libmagic, - part->parsed_data.begin, - part->parsed_data.len); - - if (mb) { - rspamd_ftok_t srch; - - srch.begin = mb; - srch.len = strlen (mb); - part->detected_ct = rspamd_content_type_parse (srch.begin, - srch.len, - task->task_pool); + if (func_pos != -1) { + struct rspamd_mime_part **pmime; + struct rspamd_task **ptask; + + lua_pushvalue (L, func_pos); + pmime = lua_newuserdata (L, sizeof (struct rspamd_mime_part *)); + rspamd_lua_setclass (L, "rspamd{mimepart}", -1); + *pmime = part; + ptask = lua_newuserdata (L, sizeof (struct rspamd_task *)); + rspamd_lua_setclass (L, "rspamd{task}", -1); + *ptask = task; + + if (lua_pcall (L, 2, 2, 0) != 0) { + msg_err_task ("cannot detect type: %s", lua_tostring (L, -1)); + } + else { + if (lua_istable (L, -1)) { + const gchar *mb; + + 
/* First returned value */ + part->detected_ext = rspamd_mempool_strdup (task->task_pool, + lua_tostring (L, -2)); + + lua_pushstring (L, "ct"); + lua_gettable (L, -2); + + if (lua_isstring (L, -1)) { + mb = lua_tostring (L, -1); + + if (mb) { + rspamd_ftok_t srch; + + srch.begin = mb; + srch.len = strlen (mb); + part->detected_ct = rspamd_content_type_parse (srch.begin, + srch.len, + task->task_pool); + } + } + + lua_pop (L, 1); + + lua_pushstring (L, "type"); + lua_gettable (L, -2); + + if (lua_isstring (L, -1)) { + part->detected_type = rspamd_mempool_strdup (task->task_pool, + lua_tostring (L, -1)); + } } } + + lua_settop (L, func_pos); } + + rspamd_message_process_text_part_maybe (task, part); } + lua_settop (L, 0); + /* Calculate average words length and number of short words */ struct rspamd_mime_text_part *text_part; gdouble *var; diff --git a/src/libmime/message.h b/src/libmime/message.h index 651e1d457..374d3a7f9 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -56,6 +56,8 @@ struct rspamd_mime_multipart { struct rspamd_mime_part { struct rspamd_content_type *ct; struct rspamd_content_type *detected_ct; + gchar *detected_type; + gchar *detected_ext; struct rspamd_content_disposition *cd; rspamd_ftok_t raw_data; rspamd_ftok_t parsed_data; diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h index 4faca7b56..263d00f38 100644 --- a/src/libserver/cfg_file.h +++ b/src/libserver/cfg_file.h @@ -437,7 +437,6 @@ struct rspamd_config { gchar *history_file; /**< file to save rolling history */ gchar *tld_file; /**< file to load effective tld list from */ gchar *hs_cache_dir; /**< directory to save hyperscan databases */ - gchar *magic_file; /**< file to initialize libmagic */ gdouble dns_timeout; /**< timeout in milliseconds for waiting for dns reply */ guint32 dns_retransmits; /**< maximum retransmits count */ diff --git a/src/libserver/cfg_rcl.c b/src/libserver/cfg_rcl.c index 2bdb6adc6..fb2cbf052 100644 --- a/src/libserver/cfg_rcl.c 
+++ b/src/libserver/cfg_rcl.c @@ -2092,12 +2092,6 @@ rspamd_rcl_config_init (struct rspamd_config *cfg, GHashTable *skip_sections) G_STRUCT_OFFSET (struct rspamd_config, ssl_ciphers), 0, "List of ssl ciphers (e.g. HIGH:!aNULL:!kRSA:!PSK:!SRP:!MD5:!RC4)"); - rspamd_rcl_add_default_handler (sub, - "magic_file", - rspamd_rcl_parse_struct_string, - G_STRUCT_OFFSET (struct rspamd_config, magic_file), - 0, - "Path to a custom libmagic file"); rspamd_rcl_add_default_handler (sub, "max_message", rspamd_rcl_parse_struct_integer, diff --git a/src/libutil/util.c b/src/libutil/util.c index 86358e46e..7877582c2 100644 --- a/src/libutil/util.c +++ b/src/libutil/util.c @@ -2364,35 +2364,6 @@ rspamd_init_libs (void) rlim.rlim_max = rlim.rlim_cur; setrlimit (RLIMIT_STACK, &rlim); - gint magic_flags = 0; - - /* Unless trusty and other crap is supported... */ -#if 0 -#ifdef MAGIC_NO_CHECK_BUILTIN - magic_flags = MAGIC_NO_CHECK_BUILTIN; -#endif -#endif - magic_flags |= MAGIC_MIME|MAGIC_NO_CHECK_COMPRESS| - MAGIC_NO_CHECK_ELF|MAGIC_NO_CHECK_TAR; -#ifdef MAGIC_NO_CHECK_CDF - magic_flags |= MAGIC_NO_CHECK_CDF; -#endif -#ifdef MAGIC_NO_CHECK_ENCODING - magic_flags |= MAGIC_NO_CHECK_ENCODING; -#endif -#ifdef MAGIC_NO_CHECK_TAR - magic_flags |= MAGIC_NO_CHECK_TAR; -#endif -#ifdef MAGIC_NO_CHECK_TEXT - magic_flags |= MAGIC_NO_CHECK_TEXT; -#endif -#ifdef MAGIC_NO_CHECK_TOKENS - magic_flags |= MAGIC_NO_CHECK_TOKENS; -#endif -#ifdef MAGIC_NO_CHECK_JSON - magic_flags |= MAGIC_NO_CHECK_JSON; -#endif - ctx->libmagic = magic_open (magic_flags); ctx->local_addrs = rspamd_inet_library_init (); REF_INIT_RETAIN (ctx, rspamd_deinit_libs); @@ -2473,10 +2444,6 @@ rspamd_config_libs (struct rspamd_external_libs_ctx *ctx, } } - if (ctx->libmagic) { - magic_load (ctx->libmagic, cfg->magic_file); - } - rspamd_free_zstd_dictionary (ctx->in_dict); rspamd_free_zstd_dictionary (ctx->out_dict); @@ -2586,10 +2553,6 @@ void rspamd_deinit_libs (struct rspamd_external_libs_ctx *ctx) { if (ctx != NULL) { - if 
 (ctx->libmagic) { - magic_close (ctx->libmagic); - } - g_free (ctx->ottery_cfg); #ifdef HAVE_OPENSSL diff --git a/src/rspamd.h b/src/rspamd.h index 0a0fb45fc..ea11965fb 100644 --- a/src/rspamd.h +++ b/src/rspamd.h @@ -33,8 +33,6 @@ #include "libserver/task.h" #include -#include <magic.h> - /* Default values */ #define FIXED_CONFIG_FILE RSPAMD_CONFDIR "/rspamd.conf" @@ -353,7 +351,6 @@ struct zstd_dictionary { struct rspamd_radix_map_helper; struct rspamd_external_libs_ctx { - magic_t libmagic; struct rspamd_radix_map_helper **local_addrs; struct rspamd_cryptobox_library_ctx *crypto_ctx; struct ottery_config *ottery_cfg;