Browse Source

Allow processing images urls for SURBL

tags/1.1.0
Vsevolod Stakhov 8 years ago
parent
commit
b22485f657
5 changed files with 150 additions and 39 deletions
  1. 24
    0
      doc/markdown/modules/surbl.md
  2. 66
    39
      src/libserver/html.c
  3. 12
    0
      src/libserver/html.h
  4. 47
    0
      src/plugins/surbl.c
  5. 1
    0
      src/plugins/surbl.h

+ 24
- 0
doc/markdown/modules/surbl.md View File

@@ -46,6 +46,8 @@ surbl {
}
rule {
suffix = "uribl.rambler.ru";
# Also check images
images = true;
symbol = "RAMBLER_URIBL";
}
rule {
@@ -77,6 +79,28 @@ In general, the configuration of `surbl` module is definition of DNS lists. Each
list must have a suffix that defines the list itself; for some lists
it is also possible to specify either `bit` or `ips` sections.

Since some URL lists do not accept IP addresses, it is also possible to disable sending URLs with an IP address as the host to such lists. This can be done by specifying the `noip = true` option:

~~~nginx
rule {
suffix = "dbl.spamhaus.org";
symbol = "DBL";
# Do not check numeric URLs
noip = true;
}
~~~

It is also possible to check the URLs of HTML images against URL blacklists. Just specify `images = true` for such a list:

~~~nginx
rule {
suffix = "uribl.rambler.ru";
# Also check images
images = true;
symbol = "RAMBLER_URIBL";
}
~~~

## Principles of operation

In this section, we define how `surbl` module performs its checks.

+ 66
- 39
src/libserver/html.c View File

@@ -1278,60 +1278,85 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
*statep = state;
}

static struct rspamd_url *
rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag)
struct rspamd_url *
rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
struct html_tag_component *comp)
{
struct html_tag_component *comp;
struct rspamd_url *url;
GList *cur;
const guchar *p;
gchar *decoded;
gint rc;
gsize decoded_len;
gboolean has_spaces = FALSE;
const gchar *p;

cur = tag->params->head;
p = start;

while (cur) {
comp = cur->data;
/* Strip spaces from the url */
/* Head spaces */
while (g_ascii_isspace (*p) && p < start + len) {
p ++;
start ++;
len --;
has_spaces = TRUE;
}

if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
/* Strip spaces from the url component */
p = comp->start;
if (comp) {
comp->start = p;
comp->len = len;
}

while (g_ascii_isspace (*p) && p < comp->start + comp->len) {
p ++;
has_spaces = TRUE;
}
/* Trailing spaces */
p = start + len - 1;

comp->start = p;
comp->len -= p - comp->start;
while (g_ascii_isspace (*p) && p >= start) {
p --;
len --;

p = comp->start + comp->len - 1;
if (comp) {
comp->len --;
}
has_spaces = TRUE;
}

while (g_ascii_isspace (*p) && p >= comp->start) {
p --;
comp->len --;
has_spaces = TRUE;
}
/* Also we need to perform url decode */
decoded = rspamd_mempool_alloc (pool, len + 1);
rspamd_strlcpy (decoded, start, len + 1);
decoded_len = rspamd_decode_url (decoded, start, len);

/* Also we need to perform url decode */
decoded = rspamd_mempool_alloc (pool, comp->len + 1);
rspamd_strlcpy (decoded, comp->start, comp->len + 1);
decoded_len = rspamd_decode_url (decoded, comp->start, comp->len);
if (comp) {
comp->start = decoded;
comp->len = decoded_len;
}

url = rspamd_mempool_alloc (pool, sizeof (*url));
rc = rspamd_url_parse (url, decoded, decoded_len, pool);
url = rspamd_mempool_alloc (pool, sizeof (*url));
rc = rspamd_url_parse (url, decoded, decoded_len, pool);

if (rc == URI_ERRNO_OK) {
if (rc == URI_ERRNO_OK) {

/* Spaces in href usually mean an attempt to obfusicate URL */
if (has_spaces) {
url->flags |= RSPAMD_URL_FLAG_OBSCURED;
}
/* Spaces in href usually mean an attempt to obfuscate URL */
if (has_spaces) {
url->flags |= RSPAMD_URL_FLAG_OBSCURED;
}

return url;
}
return url;
}

return NULL;
}

static struct rspamd_url *
rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag)
{
struct html_tag_component *comp;
GList *cur;

cur = tag->params->head;

while (cur) {
comp = cur->data;

if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
return rspamd_html_process_url (pool, comp->start, comp->len, comp);
}

cur = g_list_next (cur);
@@ -1971,7 +1996,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
save_space = FALSE;
}

if (cur_tag->id == Tag_A) {
if (cur_tag->id == Tag_A || cur_tag->id == Tag_IFRAME) {
if (!(cur_tag->flags & (FL_CLOSED|FL_CLOSING))) {
url = rspamd_html_process_url_tag (pool, cur_tag);

@@ -2007,7 +2032,8 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
href_offset = dest->len;
}
}
else if (cur_tag->flags & FL_CLOSING) {
else if (cur_tag->id == Tag_A &&
(cur_tag->flags & FL_CLOSING)) {
/* Insert exception */
if (url != NULL && (gint)dest->len > href_offset) {
rspamd_html_url_is_phished (pool, url,
@@ -2028,7 +2054,8 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
url = NULL;
}
}
else if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) {

if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) {
rspamd_html_process_img_tag (pool, cur_tag, hc);
}
else if (!(cur_tag->flags & FL_CLOSING) &&

+ 12
- 0
src/libserver/html.h View File

@@ -112,4 +112,16 @@ GByteArray* rspamd_html_process_part_full (rspamd_mempool_t *pool,
*/
gboolean rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname);

/**
* Extracts a URL from an HTML tag component and, if a component is given, updates its start and length
* @param pool
* @param start
* @param len
* @param comp
* @return
*/
struct rspamd_url * rspamd_html_process_url (rspamd_mempool_t *pool,
const gchar *start, guint len,
struct html_tag_component *comp);

#endif

+ 47
- 0
src/plugins/surbl.c View File

@@ -48,6 +48,7 @@
#include "rspamd.h"
#include "surbl.h"
#include "utlist.h"
#include "libserver/html.h"
#include "unix-std.h"

static struct surbl_ctx *surbl_module_ctx = NULL;
@@ -410,6 +411,15 @@ surbl_module_init (struct rspamd_config *cfg, struct module_ctx **ctx)
0,
NULL,
0);
rspamd_rcl_add_doc_by_path (cfg,
"surbl.rule",
"Check images URLs with this URL list",
"images",
UCL_BOOLEAN,
NULL,
0,
NULL,
0);
rspamd_rcl_add_doc_by_path (cfg,
"surbl.rule",
"Parse IP bits in DNS reply, the content is 'symbol = <bit>'",
@@ -631,6 +641,7 @@ surbl_module_config (struct rspamd_config *cfg)
new_suffix->options |= SURBL_OPTION_NOIP;
}
}

cur = ucl_obj_get_key (cur_rule, "resolve_ip");
if (cur != NULL && cur->type == UCL_BOOLEAN) {
if (ucl_object_toboolean (cur)) {
@@ -638,6 +649,13 @@ surbl_module_config (struct rspamd_config *cfg)
}
}

cur = ucl_obj_get_key (cur_rule, "images");
if (cur != NULL && cur->type == UCL_BOOLEAN) {
if (ucl_object_toboolean (cur)) {
new_suffix->options |= SURBL_OPTION_CHECKIMAGES;
}
}

if ((new_suffix->options & (SURBL_OPTION_RESOLVEIP|SURBL_OPTION_NOIP)) ==
(SURBL_OPTION_NOIP|SURBL_OPTION_RESOLVEIP)) {
/* Mutually exclusive options */
@@ -1425,6 +1443,10 @@ surbl_test_url (struct rspamd_task *task, void *user_data)
{
struct redirector_param param;
struct suffix_item *suffix = user_data;
guint i, j;
struct mime_text_part *part;
struct html_image *img;
struct rspamd_url *url;

param.task = task;
param.suffix = suffix;
@@ -1433,4 +1455,29 @@ surbl_test_url (struct rspamd_task *task, void *user_data)
(rspamd_mempool_destruct_t)g_hash_table_unref,
param.tree);
g_hash_table_foreach (task->urls, surbl_tree_url_callback, &param);

/* We also need to check and process img URLs */
if (suffix->options & SURBL_OPTION_CHECKIMAGES) {
for (i = 0; i < task->text_parts->len; i ++) {
part = g_ptr_array_index (task->text_parts, i);

if (part->html && part->html->images) {
for (j = 0; j < part->html->images->len; j ++) {
img = g_ptr_array_index (part->html->images, j);

if ((img->flags & RSPAMD_HTML_FLAG_IMAGE_EXTERNAL)
&& img->src) {
url = rspamd_html_process_url (task->task_pool,
img->src, strlen (img->src), NULL);

if (url) {
surbl_tree_url_callback (url, url, &param);
msg_debug_task ("checked image url %s over %s",
img->src, suffix->suffix);
}
}
}
}
}
}
}

+ 1
- 0
src/plugins/surbl.h View File

@@ -14,6 +14,7 @@
#define DEFAULT_SURBL_SUFFIX "multi.surbl.org"
#define SURBL_OPTION_NOIP (1 << 0)
#define SURBL_OPTION_RESOLVEIP (1 << 1)
#define SURBL_OPTION_CHECKIMAGES (1 << 2)
#define MAX_LEVELS 10

struct surbl_ctx {

Loading…
Cancel
Save