8 years ago · b22485f657
--- a/doc/markdown/modules/surbl.md
+++ b/doc/markdown/modules/surbl.md
@@ -46,6 +46,8 @@ surbl {
    }
    rule {
        suffix = "uribl.rambler.ru";
        # Also check images
        images = true;
        symbol = "RAMBLER_URIBL";
    }
    rule {
@@ -77,6 +79,28 @@ In general, the configuration of `surbl` module is definition of DNS lists. Each
 list must have suffix that defines the list itself and optionally for some lists
 it is possible to specify either `bit` or `ips` sections.

 Since some URL lists do not accept `IP` addresses, it is also possible to disable sending URLs with an IP address in the host to such lists. This is done by specifying the `noip = true` option:

 ~~~nginx
    rule {
        suffix = "dbl.spamhaus.org";
        symbol = "DBL";
        # Do not check numeric URLs
        noip = true;
    }
 ~~~

 It is also possible to check HTML image URLs against URL blacklists. Just specify `images = true` for such a list and you are done:

 ~~~nginx
    rule {
        suffix = "uribl.rambler.ru";
        # Also check images
        images = true;
        symbol = "RAMBLER_URIBL";
    }
 ~~~

 ## Principles of operation

 In this section, we define how `surbl` module performs its checks.
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -1278,60 +1278,85 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
 	*statep = state;
 }

 static struct rspamd_url *
 rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag)
 struct rspamd_url *
 rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
 		struct html_tag_component *comp)
 {
 	struct html_tag_component *comp;
 	struct rspamd_url *url;
 	GList *cur;
 	const guchar *p;
 	gchar *decoded;
 	gint rc;
 	gsize decoded_len;
 	gboolean has_spaces = FALSE;
 	const gchar *p;

 	cur = tag->params->head;
 	p = start;

 	while (cur) {
 		comp = cur->data;
 	/* Strip spaces from the url */
 	/* Head spaces */
 	while (g_ascii_isspace (*p) && p < start + len) {
 		p ++;
 		start ++;
 		len --;
 		has_spaces = TRUE;
 	}

 		if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
 			/* Strip spaces from the url component */
 			p = comp->start;
 	if (comp) {
 		comp->start = p;
 		comp->len = len;
 	}

 			while (g_ascii_isspace (*p) && p < comp->start + comp->len) {
 				p ++;
 				has_spaces = TRUE;
 			}
 	/* Trailing spaces */
 	p = start + len - 1;

 			comp->start = p;
 			comp->len -= p - comp->start;
 	while (g_ascii_isspace (*p) && p >= start) {
 		p --;
 		len --;

 			p = comp->start + comp->len - 1;
 		if (comp) {
 			comp->len --;
 		}
 		has_spaces = TRUE;
 	}

 			while (g_ascii_isspace (*p) && p >= comp->start) {
 				p --;
 				comp->len --;
 				has_spaces = TRUE;
 			}
 	/* Also we need to perform url decode */
 	decoded = rspamd_mempool_alloc (pool, len + 1);
 	rspamd_strlcpy (decoded, start, len + 1);
 	decoded_len = rspamd_decode_url (decoded, start, len);

 			/* Also we need to perform url decode */
 			decoded = rspamd_mempool_alloc (pool, comp->len + 1);
 			rspamd_strlcpy (decoded, comp->start, comp->len + 1);
 			decoded_len = rspamd_decode_url (decoded, comp->start, comp->len);
 	if (comp) {
 		comp->start = decoded;
 		comp->len = decoded_len;
 	}

 			url = rspamd_mempool_alloc (pool, sizeof (*url));
 			rc = rspamd_url_parse (url, decoded, decoded_len, pool);
 	url = rspamd_mempool_alloc (pool, sizeof (*url));
 	rc = rspamd_url_parse (url, decoded, decoded_len, pool);

 			if (rc == URI_ERRNO_OK) {
 	if (rc == URI_ERRNO_OK) {

 				/* Spaces in href usually mean an attempt to obfusicate URL */
 				if (has_spaces) {
 					url->flags |= RSPAMD_URL_FLAG_OBSCURED;
 				}
 		/* Spaces in href usually mean an attempt to obfuscate URL */
 		if (has_spaces) {
 			url->flags |= RSPAMD_URL_FLAG_OBSCURED;
 		}

 				return url;
 			}
 		return url;
 	}

 	return NULL;
 }

 static struct rspamd_url *
 rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag)
 {
 	struct html_tag_component *comp;
 	GList *cur;

 	cur = tag->params->head;

 	while (cur) {
 		comp = cur->data;

 		if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
 			return rspamd_html_process_url (pool, comp->start, comp->len, comp);
 		}

 		cur = g_list_next (cur);
@@ -1971,7 +1996,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
 					save_space = FALSE;
 				}

 				if (cur_tag->id == Tag_A) {
 				if (cur_tag->id == Tag_A || cur_tag->id == Tag_IFRAME) {
 					if (!(cur_tag->flags & (FL_CLOSED|FL_CLOSING))) {
 						url = rspamd_html_process_url_tag (pool, cur_tag);

@@ -2007,7 +2032,8 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
 							href_offset = dest->len;
 						}
 					}
 					else if (cur_tag->flags & FL_CLOSING) {
 					else if (cur_tag->id == Tag_A &&
 							(cur_tag->flags & FL_CLOSING)) {
 						/* Insert exception */
 						if (url != NULL && (gint)dest->len > href_offset) {
 							rspamd_html_url_is_phished (pool, url,
@@ -2028,7 +2054,8 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
 						url = NULL;
 					}
 				}
 				else if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) {

 				if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) {
 					rspamd_html_process_img_tag (pool, cur_tag, hc);
 				}
 				else if (!(cur_tag->flags & FL_CLOSING) &&
--- a/src/libserver/html.h
+++ b/src/libserver/html.h
@@ -112,4 +112,16 @@ GByteArray* rspamd_html_process_part_full (rspamd_mempool_t *pool,
 */
 gboolean rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname);

 /**
 * Extracts a URL from a raw HTML attribute value and, if requested, updates
 * the corresponding tag component.
 *
 * Leading and trailing ASCII whitespace is stripped from the input, the
 * result is URL-decoded into a buffer allocated from `pool`, and the decoded
 * string is then parsed as a URL. If any whitespace was stripped, the
 * returned URL is flagged with RSPAMD_URL_FLAG_OBSCURED.
 *
 * @param pool memory pool used to allocate the decoded string and the URL
 * @param start beginning of the raw URL text
 * @param len length of the raw URL text in bytes
 * @param comp optional tag component (may be NULL); when non-NULL, its
 * start/len are updated to refer to the decoded string
 * @return parsed URL on success, or NULL if the decoded string could not be
 * parsed as a URL
 */
 struct rspamd_url * rspamd_html_process_url (rspamd_mempool_t *pool,
 		const gchar *start, guint len,
 		struct html_tag_component *comp);

 #endif
--- a/src/plugins/surbl.c
+++ b/src/plugins/surbl.c
@@ -48,6 +48,7 @@
 #include "rspamd.h"
 #include "surbl.h"
 #include "utlist.h"
 #include "libserver/html.h"
 #include "unix-std.h"

 static struct surbl_ctx *surbl_module_ctx = NULL;
@@ -410,6 +411,15 @@ surbl_module_init (struct rspamd_config *cfg, struct module_ctx **ctx)
 			0,
 			NULL,
 			0);
 	rspamd_rcl_add_doc_by_path (cfg,
 			"surbl.rule",
 			"Check images URLs with this URL list",
 			"images",
 			UCL_BOOLEAN,
 			NULL,
 			0,
 			NULL,
 			0);
 	rspamd_rcl_add_doc_by_path (cfg,
 			"surbl.rule",
 			"Parse IP bits in DNS reply, the content is 'symbol = <bit>'",
@@ -631,6 +641,7 @@ surbl_module_config (struct rspamd_config *cfg)
 					new_suffix->options |= SURBL_OPTION_NOIP;
 				}
 			}

 			cur = ucl_obj_get_key (cur_rule, "resolve_ip");
 			if (cur != NULL && cur->type == UCL_BOOLEAN) {
 				if (ucl_object_toboolean (cur)) {
@@ -638,6 +649,13 @@ surbl_module_config (struct rspamd_config *cfg)
 				}
 			}

 			cur = ucl_obj_get_key (cur_rule, "images");
 			if (cur != NULL && cur->type == UCL_BOOLEAN) {
 				if (ucl_object_toboolean (cur)) {
 					new_suffix->options |= SURBL_OPTION_CHECKIMAGES;
 				}
 			}

 			if ((new_suffix->options & (SURBL_OPTION_RESOLVEIP|SURBL_OPTION_NOIP)) ==
 					(SURBL_OPTION_NOIP|SURBL_OPTION_RESOLVEIP)) {
 				/* Mutually exclusive options */
@@ -1425,6 +1443,10 @@ surbl_test_url (struct rspamd_task *task, void *user_data)
 {
 	struct redirector_param param;
 	struct suffix_item *suffix = user_data;
 	guint i, j;
 	struct mime_text_part *part;
 	struct html_image *img;
 	struct rspamd_url *url;

 	param.task = task;
 	param.suffix = suffix;
@@ -1433,4 +1455,29 @@ surbl_test_url (struct rspamd_task *task, void *user_data)
 		(rspamd_mempool_destruct_t)g_hash_table_unref,
 		param.tree);
 	g_hash_table_foreach (task->urls, surbl_tree_url_callback, &param);

 	/* We also need to check and process img URLs */
 	if (suffix->options & SURBL_OPTION_CHECKIMAGES) {
 		for (i = 0; i < task->text_parts->len; i ++) {
 			part = g_ptr_array_index (task->text_parts, i);

 			if (part->html && part->html->images) {
 				for (j = 0; j < part->html->images->len; j ++) {
 					img = g_ptr_array_index (part->html->images, j);

 					if ((img->flags & RSPAMD_HTML_FLAG_IMAGE_EXTERNAL)
 							&& img->src) {
 						url = rspamd_html_process_url (task->task_pool,
 								img->src, strlen (img->src), NULL);

 						if (url) {
 							surbl_tree_url_callback (url, url, &param);
 							msg_debug_task ("checked image url %s over %s",
 									img->src, suffix->suffix);
 						}
 					}
 				}
 			}
 		}
 	}
 }
--- a/src/plugins/surbl.h
+++ b/src/plugins/surbl.h
@@ -14,6 +14,7 @@
 #define DEFAULT_SURBL_SUFFIX "multi.surbl.org"
 #define SURBL_OPTION_NOIP (1 << 0)
 #define SURBL_OPTION_RESOLVEIP (1 << 1)
 #define SURBL_OPTION_CHECKIMAGES (1 << 2)
 #define MAX_LEVELS 10

 struct surbl_ctx {