From fe6321a9496a3ddbb0d4f04e8cd89df34d9241fa Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Wed, 23 Sep 2015 16:37:04 +0100 Subject: [PATCH] Allow to extract URLs from query strings of other URLs. Issue: #361 Reported by: @socksrambler --- src/libserver/url.c | 59 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 12 deletions(-) diff --git a/src/libserver/url.c b/src/libserver/url.c index b2e5d691c..95773baa0 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -1655,7 +1655,7 @@ rspamd_url_text_extract (rspamd_mempool_t *pool, { gint rc, state = 0; gchar *url_str = NULL; - struct rspamd_url *new; + struct rspamd_url *url; struct process_exception *ex; const gchar *p, *end, *begin, *url_start, *url_end; @@ -1671,34 +1671,69 @@ rspamd_url_text_extract (rspamd_mempool_t *pool, if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str, is_html, &state)) { if (url_str != NULL) { - new = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url)); + url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url)); ex = rspamd_mempool_alloc0 (pool, sizeof (struct process_exception)); - if (new != NULL) { + if (url != NULL) { g_strstrip (url_str); - rc = rspamd_url_parse (new, url_str, strlen (url_str), + rc = rspamd_url_parse (url, url_str, strlen (url_str), pool); if (rc == URI_ERRNO_OK && - new->hostlen > 0) { + url->hostlen > 0) { ex->pos = url_start - begin; ex->len = url_end - url_start; - if (new->protocol == PROTOCOL_MAILTO) { - if (new->userlen > 0) { - if (!g_hash_table_lookup (task->emails, new)) { - g_hash_table_insert (task->emails, new, - new); + if (url->protocol == PROTOCOL_MAILTO) { + if (url->userlen > 0) { + if (!g_hash_table_lookup (task->emails, url)) { + g_hash_table_insert (task->emails, url, + url); } } } else { - if (!g_hash_table_lookup (task->urls, new)) { - g_hash_table_insert (task->urls, new, new); + if (!g_hash_table_lookup (task->urls, url)) { + g_hash_table_insert (task->urls, url, url); } } part->urls_offset = g_list_prepend ( part->urls_offset, ex); + + /* We also search the query for additional url inside */ + if (url->querylen > 0) { + gint nstate = 0; + struct rspamd_url *query_url; + + if (rspamd_url_find (pool, + url->query, + url->querylen, + NULL, + NULL, + &url_str, + is_html, + &nstate)) { + + query_url = rspamd_mempool_alloc0 (pool, + sizeof (struct rspamd_url)); + rc = rspamd_url_parse (query_url, + url_str, + strlen (url_str), + pool); + if (rc == URI_ERRNO_OK && + url->hostlen > 0) { + msg_debug_task ("found url %s in query of url" + " %*s", url_str, url->querylen, url->query); + + if (!g_hash_table_lookup (task->urls, + query_url)) { + g_hash_table_insert (task->urls, + query_url, + query_url); + } + } + } + } } else if (rc != URI_ERRNO_OK) { msg_info_task ("extract of url '%s' failed: %s", -- 2.39.5