diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-09-23 16:37:04 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-09-23 16:37:04 +0100 |
commit | fe6321a9496a3ddbb0d4f04e8cd89df34d9241fa (patch) | |
tree | 2b63acecd964c3287a2091e3af860328e13b59ff | |
parent | 45b3b5ddf56921037e545be8283b43e9af88a9d3 (diff) | |
download | rspamd-fe6321a9496a3ddbb0d4f04e8cd89df34d9241fa.tar.gz rspamd-fe6321a9496a3ddbb0d4f04e8cd89df34d9241fa.zip |
Allow to extract URLs from query strings of other URLs.
Issue: #361
Reported by: @socksrambler
-rw-r--r-- | src/libserver/url.c | 59 |
1 files changed, 47 insertions, 12 deletions
diff --git a/src/libserver/url.c b/src/libserver/url.c index b2e5d691c..95773baa0 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -1655,7 +1655,7 @@ rspamd_url_text_extract (rspamd_mempool_t *pool, { gint rc, state = 0; gchar *url_str = NULL; - struct rspamd_url *new; + struct rspamd_url *url; struct process_exception *ex; const gchar *p, *end, *begin, *url_start, *url_end; @@ -1671,34 +1671,69 @@ rspamd_url_text_extract (rspamd_mempool_t *pool, if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str, is_html, &state)) { if (url_str != NULL) { - new = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url)); + url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url)); ex = rspamd_mempool_alloc0 (pool, sizeof (struct process_exception)); - if (new != NULL) { + if (url != NULL) { g_strstrip (url_str); - rc = rspamd_url_parse (new, url_str, strlen (url_str), + rc = rspamd_url_parse (url, url_str, strlen (url_str), pool); if (rc == URI_ERRNO_OK && - new->hostlen > 0) { + url->hostlen > 0) { ex->pos = url_start - begin; ex->len = url_end - url_start; - if (new->protocol == PROTOCOL_MAILTO) { - if (new->userlen > 0) { - if (!g_hash_table_lookup (task->emails, new)) { - g_hash_table_insert (task->emails, new, - new); + if (url->protocol == PROTOCOL_MAILTO) { + if (url->userlen > 0) { + if (!g_hash_table_lookup (task->emails, url)) { + g_hash_table_insert (task->emails, url, + url); } } } else { - if (!g_hash_table_lookup (task->urls, new)) { - g_hash_table_insert (task->urls, new, new); + if (!g_hash_table_lookup (task->urls, url)) { + g_hash_table_insert (task->urls, url, url); } } part->urls_offset = g_list_prepend ( part->urls_offset, ex); + + /* We also search the query for additional url inside */ + if (url->querylen > 0) { + gint nstate = 0; + struct rspamd_url *query_url; + + if (rspamd_url_find (pool, + url->query, + url->querylen, + NULL, + NULL, + &url_str, + is_html, + &nstate)) { + + query_url = rspamd_mempool_alloc0 (pool, + sizeof (struct rspamd_url)); + rc = rspamd_url_parse (query_url, + url_str, + strlen (url_str), + pool); + if (rc == URI_ERRNO_OK && + url->hostlen > 0) { + msg_debug_task ("found url %s in query of url" + " %*s", url_str, url->querylen, url->query); + + if (!g_hash_table_lookup (task->urls, + query_url)) { + g_hash_table_insert (task->urls, + query_url, + query_url); + } + } + } + } } else if (rc != URI_ERRNO_OK) { msg_info_task ("extract of url '%s' failed: %s", |