about | summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2016-01-14 13:30:55 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2016-01-14 13:30:55 +0000
commit: b22485f657c40c9b9fda9675d25c9294288c5732 (patch)
tree: 2ade5a2bfb8d39c7c1379dcc3ff72bb6b27c4188 /src
parent: e0fa8bf017a672d178f9024eeb46ce81eeaf188c (diff)
download: rspamd-b22485f657c40c9b9fda9675d25c9294288c5732.tar.gz
download: rspamd-b22485f657c40c9b9fda9675d25c9294288c5732.zip
Allow processing images urls for SURBL
Diffstat (limited to 'src')
-rw-r--r-- src/libserver/html.c 105
-rw-r--r-- src/libserver/html.h 12
-rw-r--r-- src/plugins/surbl.c 47
-rw-r--r-- src/plugins/surbl.h 1
4 files changed, 126 insertions, 39 deletions
diff --git a/src/libserver/html.c b/src/libserver/html.c
index 29922b133..5c55d6f30 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -1278,60 +1278,85 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool,
*statep = state;
}
-static struct rspamd_url *
-rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag)
+struct rspamd_url *
+rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
+ struct html_tag_component *comp)
{
- struct html_tag_component *comp;
struct rspamd_url *url;
- GList *cur;
- const guchar *p;
gchar *decoded;
gint rc;
gsize decoded_len;
gboolean has_spaces = FALSE;
+ const gchar *p;
- cur = tag->params->head;
+ p = start;
- while (cur) {
- comp = cur->data;
+ /* Strip spaces from the url */
+ /* Head spaces */
+ while (g_ascii_isspace (*p) && p < start + len) {
+ p ++;
+ start ++;
+ len --;
+ has_spaces = TRUE;
+ }
- if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
- /* Strip spaces from the url component */
- p = comp->start;
+ if (comp) {
+ comp->start = p;
+ comp->len = len;
+ }
- while (g_ascii_isspace (*p) && p < comp->start + comp->len) {
- p ++;
- has_spaces = TRUE;
- }
+ /* Trailing spaces */
+ p = start + len - 1;
- comp->start = p;
- comp->len -= p - comp->start;
+ while (g_ascii_isspace (*p) && p >= start) {
+ p --;
+ len --;
- p = comp->start + comp->len - 1;
+ if (comp) {
+ comp->len --;
+ }
+ has_spaces = TRUE;
+ }
- while (g_ascii_isspace (*p) && p >= comp->start) {
- p --;
- comp->len --;
- has_spaces = TRUE;
- }
+ /* Also we need to perform url decode */
+ decoded = rspamd_mempool_alloc (pool, len + 1);
+ rspamd_strlcpy (decoded, start, len + 1);
+ decoded_len = rspamd_decode_url (decoded, start, len);
- /* Also we need to perform url decode */
- decoded = rspamd_mempool_alloc (pool, comp->len + 1);
- rspamd_strlcpy (decoded, comp->start, comp->len + 1);
- decoded_len = rspamd_decode_url (decoded, comp->start, comp->len);
+ if (comp) {
+ comp->start = decoded;
+ comp->len = decoded_len;
+ }
- url = rspamd_mempool_alloc (pool, sizeof (*url));
- rc = rspamd_url_parse (url, decoded, decoded_len, pool);
+ url = rspamd_mempool_alloc (pool, sizeof (*url));
+ rc = rspamd_url_parse (url, decoded, decoded_len, pool);
- if (rc == URI_ERRNO_OK) {
+ if (rc == URI_ERRNO_OK) {
- /* Spaces in href usually mean an attempt to obfusicate URL */
- if (has_spaces) {
- url->flags |= RSPAMD_URL_FLAG_OBSCURED;
- }
+ /* Spaces in href usually mean an attempt to obfuscate URL */
+ if (has_spaces) {
+ url->flags |= RSPAMD_URL_FLAG_OBSCURED;
+ }
- return url;
- }
+ return url;
+ }
+
+ return NULL;
+}
+
+static struct rspamd_url *
+rspamd_html_process_url_tag (rspamd_mempool_t *pool, struct html_tag *tag)
+{
+ struct html_tag_component *comp;
+ GList *cur;
+
+ cur = tag->params->head;
+
+ while (cur) {
+ comp = cur->data;
+
+ if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
+ return rspamd_html_process_url (pool, comp->start, comp->len, comp);
}
cur = g_list_next (cur);
@@ -1971,7 +1996,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
save_space = FALSE;
}
- if (cur_tag->id == Tag_A) {
+ if (cur_tag->id == Tag_A || cur_tag->id == Tag_IFRAME) {
if (!(cur_tag->flags & (FL_CLOSED|FL_CLOSING))) {
url = rspamd_html_process_url_tag (pool, cur_tag);
@@ -2007,7 +2032,8 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
href_offset = dest->len;
}
}
- else if (cur_tag->flags & FL_CLOSING) {
+ else if (cur_tag->id == Tag_A &&
+ (cur_tag->flags & FL_CLOSING)) {
/* Insert exception */
if (url != NULL && (gint)dest->len > href_offset) {
rspamd_html_url_is_phished (pool, url,
@@ -2028,7 +2054,8 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
url = NULL;
}
}
- else if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) {
+
+ if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) {
rspamd_html_process_img_tag (pool, cur_tag, hc);
}
else if (!(cur_tag->flags & FL_CLOSING) &&
diff --git a/src/libserver/html.h b/src/libserver/html.h
index 3fe166961..c16e7b040 100644
--- a/src/libserver/html.h
+++ b/src/libserver/html.h
@@ -112,4 +112,16 @@ GByteArray* rspamd_html_process_part_full (rspamd_mempool_t *pool,
*/
gboolean rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname);
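+/**
+ * Extracts a URL from HTML tag component text (surrounding whitespace stripped, then URL-decoded) and, if comp is given, updates it to refer to the decoded text
+ * @param pool memory pool used to allocate the decoded text and the resulting url
+ * @param start beginning of the raw URL text
+ * @param len length of the raw URL text in bytes
+ * @param comp optional tag component whose start and len are updated (may be NULL)
+ * @return parsed url (flagged as obscured if surrounding whitespace was stripped) or NULL if parsing fails
+ */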
+struct rspamd_url * rspamd_html_process_url (rspamd_mempool_t *pool,
+ const gchar *start, guint len,
+ struct html_tag_component *comp);
+
#endif
diff --git a/src/plugins/surbl.c b/src/plugins/surbl.c
index 7572a0df8..8942f9ec5 100644
--- a/src/plugins/surbl.c
+++ b/src/plugins/surbl.c
@@ -48,6 +48,7 @@
#include "rspamd.h"
#include "surbl.h"
#include "utlist.h"
+#include "libserver/html.h"
#include "unix-std.h"
static struct surbl_ctx *surbl_module_ctx = NULL;
@@ -412,6 +413,15 @@ surbl_module_init (struct rspamd_config *cfg, struct module_ctx **ctx)
0);
rspamd_rcl_add_doc_by_path (cfg,
"surbl.rule",
+ "Check images URLs with this URL list",
+ "images",
+ UCL_BOOLEAN,
+ NULL,
+ 0,
+ NULL,
+ 0);
+ rspamd_rcl_add_doc_by_path (cfg,
+ "surbl.rule",
"Parse IP bits in DNS reply, the content is 'symbol = <bit>'",
"bits",
UCL_OBJECT,
@@ -631,6 +641,7 @@ surbl_module_config (struct rspamd_config *cfg)
new_suffix->options |= SURBL_OPTION_NOIP;
}
}
+
cur = ucl_obj_get_key (cur_rule, "resolve_ip");
if (cur != NULL && cur->type == UCL_BOOLEAN) {
if (ucl_object_toboolean (cur)) {
@@ -638,6 +649,13 @@ surbl_module_config (struct rspamd_config *cfg)
}
}
+ cur = ucl_obj_get_key (cur_rule, "images");
+ if (cur != NULL && cur->type == UCL_BOOLEAN) {
+ if (ucl_object_toboolean (cur)) {
+ new_suffix->options |= SURBL_OPTION_CHECKIMAGES;
+ }
+ }
+
if ((new_suffix->options & (SURBL_OPTION_RESOLVEIP|SURBL_OPTION_NOIP)) ==
(SURBL_OPTION_NOIP|SURBL_OPTION_RESOLVEIP)) {
/* Mutually exclusive options */
@@ -1425,6 +1443,10 @@ surbl_test_url (struct rspamd_task *task, void *user_data)
{
struct redirector_param param;
struct suffix_item *suffix = user_data;
+ guint i, j;
+ struct mime_text_part *part;
+ struct html_image *img;
+ struct rspamd_url *url;
param.task = task;
param.suffix = suffix;
@@ -1433,4 +1455,29 @@ surbl_test_url (struct rspamd_task *task, void *user_data)
(rspamd_mempool_destruct_t)g_hash_table_unref,
param.tree);
g_hash_table_foreach (task->urls, surbl_tree_url_callback, &param);
+
+ /* We also need to check and process img URLs */
+ if (suffix->options & SURBL_OPTION_CHECKIMAGES) {
+ for (i = 0; i < task->text_parts->len; i ++) {
+ part = g_ptr_array_index (task->text_parts, i);
+
+ if (part->html && part->html->images) {
+ for (j = 0; j < part->html->images->len; j ++) {
+ img = g_ptr_array_index (part->html->images, j);
+
+ if ((img->flags & RSPAMD_HTML_FLAG_IMAGE_EXTERNAL)
+ && img->src) {
+ url = rspamd_html_process_url (task->task_pool,
+ img->src, strlen (img->src), NULL);
+
+ if (url) {
+ surbl_tree_url_callback (url, url, &param);
+ msg_debug_task ("checked image url %s over %s",
+ img->src, suffix->suffix);
+ }
+ }
+ }
+ }
+ }
+ }
}
diff --git a/src/plugins/surbl.h b/src/plugins/surbl.h
index 2477032b9..68b27c3f0 100644
--- a/src/plugins/surbl.h
+++ b/src/plugins/surbl.h
@@ -14,6 +14,7 @@
#define DEFAULT_SURBL_SUFFIX "multi.surbl.org"
#define SURBL_OPTION_NOIP (1 << 0)
#define SURBL_OPTION_RESOLVEIP (1 << 1)
+#define SURBL_OPTION_CHECKIMAGES (1 << 2)
#define MAX_LEVELS 10
struct surbl_ctx {