From 6b7622a2ff2110fe1c715278386b9fdad0bedcd0 Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Sun, 22 Mar 2020 13:08:26 +0000
Subject: [PATCH] [Rework] Urls: Improve query urls handling

---
 src/libserver/url.c | 67 +++++++++++++++++++++++++--------------------
 src/libserver/url.h |  1 +
 2 files changed, 38 insertions(+), 30 deletions(-)

diff --git a/src/libserver/url.c b/src/libserver/url.c
index e0f05c3b0..30872c38d 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -3213,11 +3213,44 @@ struct rspamd_url_mimepart_cbdata {
 	gsize url_len;
 };
 
+static gboolean
+rspamd_url_query_callback (struct rspamd_url *url, gsize start_offset,
+		gsize end_offset, gpointer ud) /* start_offset/end_offset are not used here */
+{
+	struct rspamd_url_mimepart_cbdata *cbd =
+			(struct rspamd_url_mimepart_cbdata *)ud;
+	struct rspamd_task *task;
+	/* Handles one URL found inside another URL's query string; ud is the mimepart cbdata */
+	task = cbd->task; /* needed by MESSAGE_FIELD () below */
+	/* Returns FALSE when the URL is refused (empty-user mailto, URL limit), TRUE otherwise */
+	if (url->protocol == PROTOCOL_MAILTO) {
+		if (url->userlen == 0) {
+			return FALSE;
+		}
+	}
+	/* Also check max urls: refuse once the set already holds more than cfg->max_urls entries */
+	if (cbd->task->cfg && cbd->task->cfg->max_urls > 0) {
+		if (kh_size (MESSAGE_FIELD (task, urls)) > cbd->task->cfg->max_urls) {
+			msg_err_task ("part has too many URLs, we cannot process more: "
+						  "%d urls extracted ",
+					(guint)kh_size (MESSAGE_FIELD (task, urls)));
+
+			return FALSE;
+		}
+	}
+	/* Mark the URL as coming from a query string, then register it in the task's URL set */
+	url->flags |= RSPAMD_URL_FLAG_QUERY;
+	rspamd_url_set_add_or_increase (MESSAGE_FIELD (task, urls), url);
+
+	return TRUE;
+}
+
 static gboolean
 rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
 		gsize end_offset, gpointer ud)
 {
-	struct rspamd_url_mimepart_cbdata *cbd = ud;
+	struct rspamd_url_mimepart_cbdata *cbd =
+			(struct rspamd_url_mimepart_cbdata *)ud;
 	struct rspamd_process_exception *ex;
 	struct rspamd_task *task;
 	gchar *url_str = NULL;
@@ -3270,36 +3303,10 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
 
 	/* We also search the query for additional url inside */
 	if (url->querylen > 0) {
-		if (rspamd_url_find (task->task_pool,
+		rspamd_url_find_multiple (task->task_pool,
 				rspamd_url_query_unsafe (url), url->querylen,
-				&url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) {
-			query_url = rspamd_mempool_alloc0 (task->task_pool,
-					sizeof (struct rspamd_url));
-			rc = rspamd_url_parse (query_url,
-					url_str,
-					strlen (url_str),
-					task->task_pool,
-					RSPAMD_URL_PARSE_TEXT);
-
-			if (rc == URI_ERRNO_OK &&
-					query_url->hostlen > 0) {
-				msg_debug_task ("found url %s in query of url"
-						" %*s", url_str, url->querylen, rspamd_url_query_unsafe (url));
-
-				if (prefix_added) {
-					query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
-				}
-
-				if (query_url->protocol == PROTOCOL_MAILTO) {
-					if (query_url->userlen == 0) {
-						return TRUE;
-					}
-				}
-
-				query_url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
-				rspamd_url_set_add_or_increase (MESSAGE_FIELD (task, urls), query_url);
-			}
-		}
+				RSPAMD_URL_FIND_ALL, NULL,
+				rspamd_url_query_callback, cbd); /* the same cbd is forwarded to the query callback */
 	}
 
 	return TRUE;
diff --git a/src/libserver/url.h b/src/libserver/url.h
index bf8ba4b63..bb9c57399 100644
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -35,6 +35,7 @@ enum rspamd_url_flags {
 	RSPAMD_URL_FLAG_ZW_SPACES = 1u << 17u,
 	RSPAMD_URL_FLAG_DISPLAY_URL = 1u << 18u,
 	RSPAMD_URL_FLAG_IMAGE = 1u << 19u,
+	RSPAMD_URL_FLAG_QUERY = 1u << 20u, /* set by rspamd_url_query_callback () in url.c */
 };
 
 struct rspamd_url_tag {
-- 
2.39.5