From b22485f657c40c9b9fda9675d25c9294288c5732 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Thu, 14 Jan 2016 13:30:55 +0000 Subject: [PATCH] Allow processing images urls for SURBL --- doc/markdown/modules/surbl.md | 24 ++++++++ src/libserver/html.c | 105 +++++++++++++++++++++------------- src/libserver/html.h | 12 ++++ src/plugins/surbl.c | 47 +++++++++++++++ src/plugins/surbl.h | 1 + 5 files changed, 150 insertions(+), 39 deletions(-) diff --git a/doc/markdown/modules/surbl.md b/doc/markdown/modules/surbl.md index 30655b794..84f43b8c0 100644 --- a/doc/markdown/modules/surbl.md +++ b/doc/markdown/modules/surbl.md @@ -46,6 +46,8 @@ surbl { } rule { suffix = "uribl.rambler.ru"; + # Also check images + images = true; symbol = "RAMBLER_URIBL"; } rule { @@ -77,6 +79,28 @@ In general, the configuration of `surbl` module is definition of DNS lists. Each list must have suffix that defines the list itself and optionally for some lists it is possible to specify either `bit` or `ips` sections. +Since some URL lists do not accept IP addresses, it is also possible to stop URLs that have an IP address as the host from being sent to such lists. This is done by specifying the `noip = true` option: + +~~~nginx + rule { + suffix = "dbl.spamhaus.org"; + symbol = "DBL"; + # Do not check numeric URLs + noip = true; + } +~~~ + +It is also possible to check the URLs of HTML images against URL blacklists. Just specify `images = true` for such a list: + +~~~nginx + rule { + suffix = "uribl.rambler.ru"; + # Also check images + images = true; + symbol = "RAMBLER_URIBL"; + } +~~~ + ## Principles of operation In this section, we define how `surbl` module performs its checks. 
diff --git a/src/libserver/html.c b/src/libserver/html.c index 29922b133..5c55d6f30 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -1278,60 +1278,85 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool, *statep = state; } -static struct rspamd_url * -rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag) +struct rspamd_url * +rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len, + struct html_tag_component *comp) { - struct html_tag_component *comp; struct rspamd_url *url; - GList *cur; - const guchar *p; gchar *decoded; gint rc; gsize decoded_len; gboolean has_spaces = FALSE; + const gchar *p; - cur = tag->params->head; + p = start; - while (cur) { - comp = cur->data; + /* Strip spaces from the url */ + /* Head spaces */ + while (g_ascii_isspace (*p) && p < start + len) { + p ++; + start ++; + len --; + has_spaces = TRUE; + } - if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) { - /* Strip spaces from the url component */ - p = comp->start; + if (comp) { + comp->start = p; + comp->len = len; + } - while (g_ascii_isspace (*p) && p < comp->start + comp->len) { - p ++; - has_spaces = TRUE; - } + /* Trailing spaces */ + p = start + len - 1; - comp->start = p; - comp->len -= p - comp->start; + while (g_ascii_isspace (*p) && p >= start) { + p --; + len --; - p = comp->start + comp->len - 1; + if (comp) { + comp->len --; + } + has_spaces = TRUE; + } - while (g_ascii_isspace (*p) && p >= comp->start) { - p --; - comp->len --; - has_spaces = TRUE; - } + /* Also we need to perform url decode */ + decoded = rspamd_mempool_alloc (pool, len + 1); + rspamd_strlcpy (decoded, start, len + 1); + decoded_len = rspamd_decode_url (decoded, start, len); - /* Also we need to perform url decode */ - decoded = rspamd_mempool_alloc (pool, comp->len + 1); - rspamd_strlcpy (decoded, comp->start, comp->len + 1); - decoded_len = rspamd_decode_url (decoded, comp->start, comp->len); + if (comp) { + comp->start = 
decoded; + comp->len = decoded_len; + } - url = rspamd_mempool_alloc (pool, sizeof (*url)); - rc = rspamd_url_parse (url, decoded, decoded_len, pool); + url = rspamd_mempool_alloc (pool, sizeof (*url)); + rc = rspamd_url_parse (url, decoded, decoded_len, pool); - if (rc == URI_ERRNO_OK) { + if (rc == URI_ERRNO_OK) { - /* Spaces in href usually mean an attempt to obfusicate URL */ - if (has_spaces) { - url->flags |= RSPAMD_URL_FLAG_OBSCURED; - } + /* Spaces in href usually mean an attempt to obfuscate URL */ + if (has_spaces) { + url->flags |= RSPAMD_URL_FLAG_OBSCURED; + } - return url; - } + return url; + } + + return NULL; +} + +static struct rspamd_url * +rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag) +{ + struct html_tag_component *comp; + GList *cur; + + cur = tag->params->head; + + while (cur) { + comp = cur->data; + + if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) { + return rspamd_html_process_url (pool, comp->start, comp->len, comp); } cur = g_list_next (cur); @@ -1971,7 +1996,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, save_space = FALSE; } - if (cur_tag->id == Tag_A) { + if (cur_tag->id == Tag_A || cur_tag->id == Tag_IFRAME) { if (!(cur_tag->flags & (FL_CLOSED|FL_CLOSING))) { url = rspamd_html_process_url_tag (pool, cur_tag); @@ -2007,7 +2032,8 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, href_offset = dest->len; } } - else if (cur_tag->flags & FL_CLOSING) { + else if (cur_tag->id == Tag_A && + (cur_tag->flags & FL_CLOSING)) { /* Insert exception */ if (url != NULL && (gint)dest->len > href_offset) { rspamd_html_url_is_phished (pool, url, @@ -2028,7 +2054,8 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, url = NULL; } } - else if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) { + + if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) { rspamd_html_process_img_tag (pool, cur_tag, 
hc); } else if (!(cur_tag->flags & FL_CLOSING) && diff --git a/src/libserver/html.h b/src/libserver/html.h index 3fe166961..c16e7b040 100644 --- a/src/libserver/html.h +++ b/src/libserver/html.h @@ -112,4 +112,16 @@ GByteArray* rspamd_html_process_part_full (rspamd_mempool_t *pool, */ gboolean rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname); +/** + * Extracts a URL from the text of an HTML tag component and updates the component's start and length if a component is given + * @param pool memory pool used to allocate the decoded string and the resulting URL + * @param start beginning of the raw URL text + * @param len length of the raw URL text + * @param comp optional tag component (may be NULL); when given, its start and length are updated to point at the decoded URL + * @return parsed URL, or NULL if the text cannot be parsed as a URL + */ +struct rspamd_url * rspamd_html_process_url (rspamd_mempool_t *pool, + const gchar *start, guint len, + struct html_tag_component *comp); + #endif diff --git a/src/plugins/surbl.c b/src/plugins/surbl.c index 7572a0df8..8942f9ec5 100644 --- a/src/plugins/surbl.c +++ b/src/plugins/surbl.c @@ -48,6 +48,7 @@ #include "rspamd.h" #include "surbl.h" #include "utlist.h" +#include "libserver/html.h" #include "unix-std.h" static struct surbl_ctx *surbl_module_ctx = NULL; @@ -410,6 +411,15 @@ surbl_module_init (struct rspamd_config *cfg, struct module_ctx **ctx) 0, NULL, 0); + rspamd_rcl_add_doc_by_path (cfg, + "surbl.rule", + "Check images URLs with this URL list", + "images", + UCL_BOOLEAN, + NULL, + 0, + NULL, + 0); rspamd_rcl_add_doc_by_path (cfg, "surbl.rule", "Parse IP bits in DNS reply, the content is 'symbol = '", @@ -631,6 +641,7 @@ surbl_module_config (struct rspamd_config *cfg) new_suffix->options |= SURBL_OPTION_NOIP; } } + cur = ucl_obj_get_key (cur_rule, "resolve_ip"); if (cur != NULL && cur->type == UCL_BOOLEAN) { if (ucl_object_toboolean (cur)) { @@ -638,6 +649,13 @@ surbl_module_config (struct rspamd_config *cfg) } } + cur = ucl_obj_get_key (cur_rule, "images"); + if (cur != NULL && cur->type == UCL_BOOLEAN) { + if (ucl_object_toboolean (cur)) { + new_suffix->options |= SURBL_OPTION_CHECKIMAGES; + } + } + if ((new_suffix->options & (SURBL_OPTION_RESOLVEIP|SURBL_OPTION_NOIP)) == (SURBL_OPTION_NOIP|SURBL_OPTION_RESOLVEIP)) { /* Mutually 
exclusive options */ @@ -1425,6 +1443,10 @@ surbl_test_url (struct rspamd_task *task, void *user_data) { struct redirector_param param; struct suffix_item *suffix = user_data; + guint i, j; + struct mime_text_part *part; + struct html_image *img; + struct rspamd_url *url; param.task = task; param.suffix = suffix; @@ -1433,4 +1455,29 @@ surbl_test_url (struct rspamd_task *task, void *user_data) (rspamd_mempool_destruct_t)g_hash_table_unref, param.tree); g_hash_table_foreach (task->urls, surbl_tree_url_callback, &param); + + /* We also need to check and process img URLs */ + if (suffix->options & SURBL_OPTION_CHECKIMAGES) { + for (i = 0; i < task->text_parts->len; i ++) { + part = g_ptr_array_index (task->text_parts, i); + + if (part->html && part->html->images) { + for (j = 0; j < part->html->images->len; j ++) { + img = g_ptr_array_index (part->html->images, j); + + if ((img->flags & RSPAMD_HTML_FLAG_IMAGE_EXTERNAL) + && img->src) { + url = rspamd_html_process_url (task->task_pool, + img->src, strlen (img->src), NULL); + + if (url) { + surbl_tree_url_callback (url, url, &param); + msg_debug_task ("checked image url %s over %s", + img->src, suffix->suffix); + } + } + } + } + } + } } diff --git a/src/plugins/surbl.h b/src/plugins/surbl.h index 2477032b9..68b27c3f0 100644 --- a/src/plugins/surbl.h +++ b/src/plugins/surbl.h @@ -14,6 +14,7 @@ #define DEFAULT_SURBL_SUFFIX "multi.surbl.org" #define SURBL_OPTION_NOIP (1 << 0) #define SURBL_OPTION_RESOLVEIP (1 << 1) +#define SURBL_OPTION_CHECKIMAGES (1 << 2) #define MAX_LEVELS 10 struct surbl_ctx { -- 2.39.5