From: Vsevolod Stakhov Date: Mon, 13 Feb 2017 15:27:04 +0000 (+0000) Subject: [Feature] Add heuristic to find displayed URLs X-Git-Tag: 1.5.0~113 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=5244d25f3b57d7c46ba82413aedc6116d6cb294c;p=rspamd.git [Feature] Add heuristic to find displayed URLs --- diff --git a/src/libserver/html.c b/src/libserver/html.c index 785951d66..72e8cbd9d 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -765,16 +765,23 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool, struct rspamd_url *href_url, const guchar *url_text, gsize len, - gboolean *url_found) + gboolean *url_found, + struct rspamd_url **ptext_url) { struct rspamd_url *text_url; rspamd_ftok_t phished_tld; gint rc; gchar *url_str = NULL; + const guchar *end = url_text + len; *url_found = FALSE; - if (rspamd_url_find (pool, url_text, len, &url_str, TRUE) && url_str != NULL) { + while (url_text < end && g_ascii_isspace (*url_text)) { + url_text ++; + } + + if (rspamd_url_find (pool, url_text, end - url_text, &url_str, FALSE) && + url_str != NULL) { text_url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url)); rc = rspamd_url_parse (text_url, url_str, strlen (url_str), pool); @@ -791,9 +798,11 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool, rspamd_url_add_tag (text_url, "phishing", rspamd_mempool_ftokdup (pool, &phished_tld), pool); + text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED; } } + *ptext_url = text_url; *url_found = TRUE; } else { @@ -2124,10 +2133,13 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, (cur_tag->flags & FL_CLOSING)) { /* Insert exception */ if (url != NULL && (gint)dest->len > href_offset) { + struct rspamd_url *displayed_url = NULL; + rspamd_html_url_is_phished (pool, url, dest->data + href_offset, dest->len - href_offset, - &url_text); + &url_text, &displayed_url); + if (exceptions && url_text) { ex = rspamd_mempool_alloc (pool, sizeof (*ex)); ex->pos = href_offset; @@ -2136,6 +2148,32 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, *exceptions = g_list_prepend (*exceptions, ex); } + + if (displayed_url) { + if (url->protocol == PROTOCOL_MAILTO) { + target_tbl = emails; + } + else { + target_tbl = urls; + } + + if (target_tbl != NULL) { + turl = g_hash_table_lookup (target_tbl, url); + + if (turl != NULL) { + /* Here, we assume the following: + * if we have a URL in the text part which + * is the same as displayed URL in the + * HTML part, we assume that it is also + * hint only. + */ + if (turl->flags & RSPAMD_URL_FLAG_FROM_TEXT) { + turl->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED; + turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT; + } + } + } + } } href_offset = -1; diff --git a/src/libserver/url.c b/src/libserver/url.c index 419b7e84a..c4c5fd038 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -2364,6 +2364,7 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset, if (url->protocol == PROTOCOL_MAILTO) { if (url->userlen > 0) { if (!g_hash_table_lookup (task->emails, url)) { + url->flags |= RSPAMD_URL_FLAG_FROM_TEXT; g_hash_table_insert (task->emails, url, url); } @@ -2371,6 +2372,7 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset, } else { if (!g_hash_table_lookup (task->urls, url)) { + url->flags |= RSPAMD_URL_FLAG_FROM_TEXT; g_hash_table_insert (task->urls, url, url); } } @@ -2401,6 +2403,7 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset, if (!g_hash_table_lookup (task->urls, query_url)) { + query_url->flags |= RSPAMD_URL_FLAG_FROM_TEXT; g_hash_table_insert (task->urls, query_url, query_url); diff --git a/src/libserver/url.h b/src/libserver/url.h index f56649558..3fab46c5e 100644 --- a/src/libserver/url.h +++ b/src/libserver/url.h @@ -14,6 +14,8 @@ enum rspamd_url_flags { RSPAMD_URL_FLAG_NUMERIC = 1 << 1, RSPAMD_URL_FLAG_OBSCURED = 1 << 2, RSPAMD_URL_FLAG_REDIRECTED = 1 << 3, + RSPAMD_URL_FLAG_HTML_DISPLAYED = 1 << 4, + RSPAMD_URL_FLAG_FROM_TEXT = 1 << 5, }; struct rspamd_url_tag {