@@ -46,6 +46,8 @@ surbl { | |||
} | |||
rule { | |||
suffix = "uribl.rambler.ru"; | |||
# Also check images | |||
images = true; | |||
symbol = "RAMBLER_URIBL"; | |||
} | |||
rule { | |||
@@ -77,6 +79,28 @@ In general, the configuration of `surbl` module is definition of DNS lists. Each | |||
list must have a suffix that defines the list itself and, for some lists, | |||
it is also possible to specify either `bit` or `ips` sections. | |||
Since some URL lists do not accept IP addresses, it is also possible to disable sending URLs whose host is an IP address to such lists. This is done by specifying the `noip = true` option: | |||
~~~nginx | |||
rule { | |||
suffix = "dbl.spamhaus.org"; | |||
symbol = "DBL"; | |||
# Do not check numeric URLs | |||
noip = true; | |||
} | |||
~~~ | |||
It is also possible to check the URLs of HTML images against URL blacklists. Just specify `images = true` for such a list: | |||
~~~nginx | |||
rule { | |||
suffix = "uribl.rambler.ru"; | |||
# Also check images | |||
images = true; | |||
symbol = "RAMBLER_URIBL"; | |||
} | |||
~~~ | |||
## Principles of operation | |||
In this section, we describe how the `surbl` module performs its checks. |
@@ -1278,60 +1278,85 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool, | |||
*statep = state; | |||
} | |||
static struct rspamd_url * | |||
rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag) | |||
struct rspamd_url * | |||
rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len, | |||
struct html_tag_component *comp) | |||
{ | |||
struct html_tag_component *comp; | |||
struct rspamd_url *url; | |||
GList *cur; | |||
const guchar *p; | |||
gchar *decoded; | |||
gint rc; | |||
gsize decoded_len; | |||
gboolean has_spaces = FALSE; | |||
const gchar *p; | |||
cur = tag->params->head; | |||
p = start; | |||
while (cur) { | |||
comp = cur->data; | |||
/* Strip spaces from the url */ | |||
/* Head spaces */ | |||
while (g_ascii_isspace (*p) && p < start + len) { | |||
p ++; | |||
start ++; | |||
len --; | |||
has_spaces = TRUE; | |||
} | |||
if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) { | |||
/* Strip spaces from the url component */ | |||
p = comp->start; | |||
if (comp) { | |||
comp->start = p; | |||
comp->len = len; | |||
} | |||
while (g_ascii_isspace (*p) && p < comp->start + comp->len) { | |||
p ++; | |||
has_spaces = TRUE; | |||
} | |||
/* Trailing spaces */ | |||
p = start + len - 1; | |||
comp->start = p; | |||
comp->len -= p - comp->start; | |||
while (g_ascii_isspace (*p) && p >= start) { | |||
p --; | |||
len --; | |||
p = comp->start + comp->len - 1; | |||
if (comp) { | |||
comp->len --; | |||
} | |||
has_spaces = TRUE; | |||
} | |||
while (g_ascii_isspace (*p) && p >= comp->start) { | |||
p --; | |||
comp->len --; | |||
has_spaces = TRUE; | |||
} | |||
/* Also we need to perform url decode */ | |||
decoded = rspamd_mempool_alloc (pool, len + 1); | |||
rspamd_strlcpy (decoded, start, len + 1); | |||
decoded_len = rspamd_decode_url (decoded, start, len); | |||
/* Also we need to perform url decode */ | |||
decoded = rspamd_mempool_alloc (pool, comp->len + 1); | |||
rspamd_strlcpy (decoded, comp->start, comp->len + 1); | |||
decoded_len = rspamd_decode_url (decoded, comp->start, comp->len); | |||
if (comp) { | |||
comp->start = decoded; | |||
comp->len = decoded_len; | |||
} | |||
url = rspamd_mempool_alloc (pool, sizeof (*url)); | |||
rc = rspamd_url_parse (url, decoded, decoded_len, pool); | |||
url = rspamd_mempool_alloc (pool, sizeof (*url)); | |||
rc = rspamd_url_parse (url, decoded, decoded_len, pool); | |||
if (rc == URI_ERRNO_OK) { | |||
if (rc == URI_ERRNO_OK) { | |||
/* Spaces in href usually mean an attempt to obfusicate URL */ | |||
if (has_spaces) { | |||
url->flags |= RSPAMD_URL_FLAG_OBSCURED; | |||
} | |||
/* Spaces in href usually mean an attempt to obfuscate URL */ | |||
if (has_spaces) { | |||
url->flags |= RSPAMD_URL_FLAG_OBSCURED; | |||
} | |||
return url; | |||
} | |||
return url; | |||
} | |||
return NULL; | |||
} | |||
static struct rspamd_url * | |||
rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag) | |||
{ | |||
struct html_tag_component *comp; | |||
GList *cur; | |||
cur = tag->params->head; | |||
while (cur) { | |||
comp = cur->data; | |||
if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) { | |||
return rspamd_html_process_url (pool, comp->start, comp->len, comp); | |||
} | |||
cur = g_list_next (cur); | |||
@@ -1971,7 +1996,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, | |||
save_space = FALSE; | |||
} | |||
if (cur_tag->id == Tag_A) { | |||
if (cur_tag->id == Tag_A || cur_tag->id == Tag_IFRAME) { | |||
if (!(cur_tag->flags & (FL_CLOSED|FL_CLOSING))) { | |||
url = rspamd_html_process_url_tag (pool, cur_tag); | |||
@@ -2007,7 +2032,8 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, | |||
href_offset = dest->len; | |||
} | |||
} | |||
else if (cur_tag->flags & FL_CLOSING) { | |||
else if (cur_tag->id == Tag_A && | |||
(cur_tag->flags & FL_CLOSING)) { | |||
/* Insert exception */ | |||
if (url != NULL && (gint)dest->len > href_offset) { | |||
rspamd_html_url_is_phished (pool, url, | |||
@@ -2028,7 +2054,8 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, | |||
url = NULL; | |||
} | |||
} | |||
else if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) { | |||
if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) { | |||
rspamd_html_process_img_tag (pool, cur_tag, hc); | |||
} | |||
else if (!(cur_tag->flags & FL_CLOSING) && |
@@ -112,4 +112,16 @@ GByteArray* rspamd_html_process_part_full (rspamd_mempool_t *pool, | |||
*/ | |||
gboolean rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname); | |||
/** | |||
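* Parses a URL from the given text after stripping leading and trailing | |||
* whitespace and URL-decoding it; if `comp` is given, it is updated to the processed text | |||
* @param pool memory pool used to allocate the decoded string and the URL object | |||
* @param start start of the raw URL text | |||
* @param len length of the raw URL text | |||
* @param comp optional HTML tag component to update (may be NULL) | |||
* @return parsed URL, or NULL if the text could not be parsed as a URL | |||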
*/ | |||
struct rspamd_url * rspamd_html_process_url (rspamd_mempool_t *pool, | |||
const gchar *start, guint len, | |||
struct html_tag_component *comp); | |||
#endif |
@@ -48,6 +48,7 @@ | |||
#include "rspamd.h" | |||
#include "surbl.h" | |||
#include "utlist.h" | |||
#include "libserver/html.h" | |||
#include "unix-std.h" | |||
static struct surbl_ctx *surbl_module_ctx = NULL; | |||
@@ -410,6 +411,15 @@ surbl_module_init (struct rspamd_config *cfg, struct module_ctx **ctx) | |||
0, | |||
NULL, | |||
0); | |||
rspamd_rcl_add_doc_by_path (cfg, | |||
"surbl.rule", | |||
"Check images URLs with this URL list", | |||
"images", | |||
UCL_BOOLEAN, | |||
NULL, | |||
0, | |||
NULL, | |||
0); | |||
rspamd_rcl_add_doc_by_path (cfg, | |||
"surbl.rule", | |||
"Parse IP bits in DNS reply, the content is 'symbol = <bit>'", | |||
@@ -631,6 +641,7 @@ surbl_module_config (struct rspamd_config *cfg) | |||
new_suffix->options |= SURBL_OPTION_NOIP; | |||
} | |||
} | |||
cur = ucl_obj_get_key (cur_rule, "resolve_ip"); | |||
if (cur != NULL && cur->type == UCL_BOOLEAN) { | |||
if (ucl_object_toboolean (cur)) { | |||
@@ -638,6 +649,13 @@ surbl_module_config (struct rspamd_config *cfg) | |||
} | |||
} | |||
cur = ucl_obj_get_key (cur_rule, "images"); | |||
if (cur != NULL && cur->type == UCL_BOOLEAN) { | |||
if (ucl_object_toboolean (cur)) { | |||
new_suffix->options |= SURBL_OPTION_CHECKIMAGES; | |||
} | |||
} | |||
if ((new_suffix->options & (SURBL_OPTION_RESOLVEIP|SURBL_OPTION_NOIP)) == | |||
(SURBL_OPTION_NOIP|SURBL_OPTION_RESOLVEIP)) { | |||
/* Mutually exclusive options */ | |||
@@ -1425,6 +1443,10 @@ surbl_test_url (struct rspamd_task *task, void *user_data) | |||
{ | |||
struct redirector_param param; | |||
struct suffix_item *suffix = user_data; | |||
guint i, j; | |||
struct mime_text_part *part; | |||
struct html_image *img; | |||
struct rspamd_url *url; | |||
param.task = task; | |||
param.suffix = suffix; | |||
@@ -1433,4 +1455,29 @@ surbl_test_url (struct rspamd_task *task, void *user_data) | |||
(rspamd_mempool_destruct_t)g_hash_table_unref, | |||
param.tree); | |||
g_hash_table_foreach (task->urls, surbl_tree_url_callback, ¶m); | |||
/* We also need to check and process img URLs */ | |||
if (suffix->options & SURBL_OPTION_CHECKIMAGES) { | |||
for (i = 0; i < task->text_parts->len; i ++) { | |||
part = g_ptr_array_index (task->text_parts, i); | |||
if (part->html && part->html->images) { | |||
for (j = 0; j < part->html->images->len; j ++) { | |||
img = g_ptr_array_index (part->html->images, j); | |||
if ((img->flags & RSPAMD_HTML_FLAG_IMAGE_EXTERNAL) | |||
&& img->src) { | |||
url = rspamd_html_process_url (task->task_pool, | |||
img->src, strlen (img->src), NULL); | |||
if (url) { | |||
surbl_tree_url_callback (url, url, ¶m); | |||
msg_debug_task ("checked image url %s over %s", | |||
img->src, suffix->suffix); | |||
} | |||
} | |||
} | |||
} | |||
} | |||
} | |||
} |
@@ -14,6 +14,7 @@ | |||
#define DEFAULT_SURBL_SUFFIX "multi.surbl.org" | |||
#define SURBL_OPTION_NOIP (1 << 0) | |||
#define SURBL_OPTION_RESOLVEIP (1 << 1) | |||
#define SURBL_OPTION_CHECKIMAGES (1 << 2) | |||
#define MAX_LEVELS 10 | |||
struct surbl_ctx { |