From a59ff17c8be4d6ebe6db7b6fc2cebf724c4fd865 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 4 Feb 2019 16:35:21 +0000 Subject: [PATCH] [Regression] Fix urls output in the protocol --- src/libserver/protocol.c | 67 ++++++++++++++++++++++++++++------------ src/libserver/url.c | 48 +++++++++++++++++++++++----- src/libserver/url.h | 3 ++ 3 files changed, 91 insertions(+), 27 deletions(-) diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c index ba468ee5f..5bcfbc37a 100644 --- a/src/libserver/protocol.c +++ b/src/libserver/protocol.c @@ -435,14 +435,17 @@ rspamd_protocol_handle_headers (struct rspamd_task *task, } } IF_HEADER (URLS_HEADER) { + msg_debug_protocol ("read urls header, value: %V", hv); + srch.begin = "extended"; srch.len = 8; - msg_debug_protocol ("read urls header, value: %V", hv); if (rspamd_ftok_casecmp (hv_tok, &srch) == 0) { task->flags |= RSPAMD_TASK_FLAG_EXT_URLS; msg_debug_protocol ("extended urls information"); } + + /* TODO: add more formats there */ } IF_HEADER (USER_AGENT_HEADER) { msg_debug_protocol ("read user-agent header, value: %V", hv); @@ -665,6 +668,7 @@ rspamd_protocol_handle_request (struct rspamd_task *task, /* Structure for writing tree data */ struct tree_cb_data { ucl_object_t *top; + GHashTable *seen; struct rspamd_task *task; }; @@ -715,17 +719,37 @@ urls_protocol_cb (gpointer key, gpointer value, gpointer ud) struct rspamd_url *url = value; ucl_object_t *obj; struct rspamd_task *task = cb->task; - const gchar *user_field = "unknown", *encoded; + const gchar *user_field = "unknown", *encoded = NULL; gboolean has_user = FALSE; guint len = 0; - gsize enclen; - - encoded = rspamd_url_encode (url, &enclen, task->task_pool); + gsize enclen = 0; if (!(task->flags & RSPAMD_TASK_FLAG_EXT_URLS)) { - obj = ucl_object_fromlstring (encoded, enclen); + if (url->hostlen > 0) { + if (g_hash_table_lookup (cb->seen, url)) { + return; + } + + const gchar *end = NULL; + + if (g_utf8_validate (url->host, url->hostlen, &end)) { + obj = ucl_object_fromlstring (url->host, url->hostlen); + } + else if (end - url->host > 0) { + obj = ucl_object_fromlstring (url->host, end - url->host); + } + else { + return; + } + } + else { + return; + } + + g_hash_table_insert (cb->seen, url, url); } else { + encoded = rspamd_url_encode (url, &enclen, task->task_pool); obj = rspamd_protocol_extended_url (task, url, encoded, enclen); } @@ -742,6 +766,10 @@ urls_protocol_cb (gpointer key, gpointer value, gpointer ud) len = task->from_envelope->addr_len; } + if (!encoded) { + encoded = rspamd_url_encode (url, &enclen, task->task_pool); + } + msg_notice_task_encrypted ("<%s> %s: %*s; ip: %s; URL: %*s", task->message_id, has_user ? "user" : "from", @@ -760,9 +788,12 @@ rspamd_urls_tree_ucl (GHashTable *input, struct rspamd_task *task) obj = ucl_object_typed_new (UCL_ARRAY); cb.top = obj; cb.task = task; + cb.seen = g_hash_table_new (rspamd_url_host_hash, rspamd_urls_host_cmp); g_hash_table_foreach (input, urls_protocol_cb, &cb); + g_hash_table_unref (cb.seen); + return obj; } @@ -1168,18 +1199,16 @@ rspamd_protocol_write_ucl (struct rspamd_task *task, } if (flags & RSPAMD_PROTOCOL_URLS) { - if (task->flags & RSPAMD_TASK_FLAG_EXT_URLS) { - if (g_hash_table_size (task->urls) > 0) { - ucl_object_insert_key (top, - rspamd_urls_tree_ucl (task->urls, task), - "urls", 0, false); - } + if (g_hash_table_size (task->urls) > 0) { + ucl_object_insert_key (top, + rspamd_urls_tree_ucl (task->urls, task), + "urls", 0, false); + } - if (g_hash_table_size (task->emails) > 0) { - ucl_object_insert_key (top, - rspamd_emails_tree_ucl (task->emails, task), - "emails", 0, false); - } + if (g_hash_table_size (task->emails) > 0) { + ucl_object_insert_key (top, + rspamd_emails_tree_ucl (task->emails, task), + "emails", 0, false); } } @@ -1279,9 +1308,7 @@ rspamd_protocol_http_reply (struct rspamd_http_message *msg, rspamd_http_message_add_header (msg, hn->begin, hv->begin); } - if (task->cfg->log_urls || (task->flags & RSPAMD_TASK_FLAG_EXT_URLS)) { - flags |= RSPAMD_PROTOCOL_URLS; - } + flags |= RSPAMD_PROTOCOL_URLS; top = rspamd_protocol_write_ucl (task, flags); diff --git a/src/libserver/url.c b/src/libserver/url.c index 4599f3ce1..4dcd11c9e 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -2971,15 +2971,26 @@ guint rspamd_url_hash (gconstpointer u) { const struct rspamd_url *url = u; - rspamd_cryptobox_fast_hash_state_t st; - - rspamd_cryptobox_fast_hash_init (&st, rspamd_hash_seed ()); if (url->urllen > 0) { - rspamd_cryptobox_fast_hash_update (&st, url->string, url->urllen); + return rspamd_cryptobox_fast_hash (url->string, url->urllen, + rspamd_hash_seed ()); } - return rspamd_cryptobox_fast_hash_final (&st); + return 0; +} + +guint +rspamd_url_host_hash (gconstpointer u) +{ + const struct rspamd_url *url = u; + + if (url->hostlen > 0) { + return rspamd_cryptobox_fast_hash (url->host, url->hostlen, + rspamd_hash_seed ()); + } + + return 0; } guint @@ -3045,6 +3056,22 @@ rspamd_urls_cmp (gconstpointer a, gconstpointer b) return r == 0; } +gboolean +rspamd_urls_host_cmp (gconstpointer a, gconstpointer b) +{ + const struct rspamd_url *u1 = a, *u2 = b; + int r = 0; + + if (u1->hostlen != u2->hostlen) { + return FALSE; + } + else { + r = memcmp (u1->host, u2->host, u1->hostlen); + } + + return r == 0; +} + gsize rspamd_url_decode (gchar *dst, const gchar *src, gsize size) { @@ -3255,8 +3282,15 @@ rspamd_url_encode (struct rspamd_url *url, gsize *pdlen, dest = rspamd_mempool_alloc (pool, dlen + 1); d = dest; dend = d + dlen; - d += rspamd_snprintf ((gchar *)d, dend - d, - "%*s://", url->protocollen, rspamd_url_protocols[url->protocol].name); + + if (url->protocollen > 0 && + (url->protocol >= 0 && url->protocol < G_N_ELEMENTS (rspamd_url_protocols))) { + d += rspamd_snprintf ((gchar *) d, dend - d, + "%*s://", url->protocollen, rspamd_url_protocols[url->protocol].name); + } + else { + d += rspamd_snprintf ((gchar *) d, dend - d, "http://"); + } if (url->userlen > 0) { ENCODE_URL_COMPONENT ((guchar *)url->user, url->userlen, diff --git a/src/libserver/url.h b/src/libserver/url.h index fa5c69f00..523fb2c1f 100644 --- a/src/libserver/url.h +++ b/src/libserver/url.h @@ -203,12 +203,15 @@ void rspamd_url_add_tag (struct rspamd_url *url, const gchar *tag, guint rspamd_url_hash (gconstpointer u); guint rspamd_email_hash (gconstpointer u); +guint rspamd_url_host_hash (gconstpointer u); + /* Compare two emails for building emails hash */ gboolean rspamd_emails_cmp (gconstpointer a, gconstpointer b); /* Compare two urls for building emails hash */ gboolean rspamd_urls_cmp (gconstpointer a, gconstpointer b); +gboolean rspamd_urls_host_cmp (gconstpointer a, gconstpointer b); /** * Decode URL encoded string in-place and return new length of a string, src and dst are NULL terminated -- 2.39.5