source.dussan.org Git - rspamd.git/commitdiff
[Feature] Add heuristic to find displayed URLs
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 13 Feb 2017 15:27:04 +0000 (15:27 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 13 Feb 2017 15:27:04 +0000 (15:27 +0000)
src/libserver/html.c
src/libserver/url.c
src/libserver/url.h

index 785951d661d478f180efa2020e0db0b6ac694699..72e8cbd9d2713b49d2de96df2b6cac21003bf678 100644 (file)
@@ -765,16 +765,23 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
        struct rspamd_url *href_url,
        const guchar *url_text,
        gsize len,
-       gboolean *url_found)
+       gboolean *url_found,
+       struct rspamd_url **ptext_url)
 {
        struct rspamd_url *text_url;
        rspamd_ftok_t phished_tld;
        gint rc;
        gchar *url_str = NULL;
+       const guchar *end = url_text + len;
 
        *url_found = FALSE;
 
-       if (rspamd_url_find (pool, url_text, len, &url_str, TRUE) && url_str != NULL) {
+       while (url_text < end && g_ascii_isspace (*url_text)) {
+               url_text ++;
+       }
+
+       if (rspamd_url_find (pool, url_text, end - url_text, &url_str, FALSE) &&
+                       url_str != NULL) {
                text_url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
                rc = rspamd_url_parse (text_url, url_str, strlen (url_str), pool);
 
@@ -791,9 +798,11 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
                                        rspamd_url_add_tag (text_url, "phishing",
                                                        rspamd_mempool_ftokdup (pool, &phished_tld),
                                                        pool);
+                                       text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
                                }
                        }
 
+                       *ptext_url = text_url;
                        *url_found = TRUE;
                }
                else {
@@ -2124,10 +2133,13 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
                                                        (cur_tag->flags & FL_CLOSING)) {
                                                /* Insert exception */
                                                if (url != NULL && (gint)dest->len > href_offset) {
+                                                       struct rspamd_url *displayed_url = NULL;
+
                                                        rspamd_html_url_is_phished (pool, url,
                                                                        dest->data + href_offset,
                                                                        dest->len - href_offset,
-                                                                       &url_text);
+                                                                       &url_text, &displayed_url);
+
                                                        if (exceptions && url_text) {
                                                                ex = rspamd_mempool_alloc (pool, sizeof (*ex));
                                                                ex->pos = href_offset;
@@ -2136,6 +2148,32 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
 
                                                                *exceptions = g_list_prepend (*exceptions, ex);
                                                        }
+
+                                                       if (displayed_url) {
+                                                               if (url->protocol == PROTOCOL_MAILTO) {
+                                                                       target_tbl = emails;
+                                                               }
+                                                               else {
+                                                                       target_tbl = urls;
+                                                               }
+
+                                                               if (target_tbl != NULL) {
+                                                                       turl = g_hash_table_lookup (target_tbl, url);
+
+                                                                       if (turl != NULL) {
+                                                                               /* Here, we assume the following:
+                                                                                * if we have a URL in the text part which
+                                                                                * is the same as displayed URL in the
+                                                                                * HTML part, we assume that it is also
+                                                                                * hint only.
+                                                                                */
+                                                                               if (turl->flags & RSPAMD_URL_FLAG_FROM_TEXT) {
+                                                                                       turl->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
+                                                                                       turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
+                                                                               }
+                                                                       }
+                                                               }
+                                                       }
                                                }
 
                                                href_offset = -1;
index 419b7e84afc960ad6d7bff49d8fbf53b0d1cb31b..c4c5fd038456acee1640c21b7171a1d98f1d30a6 100644 (file)
@@ -2364,6 +2364,7 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
        if (url->protocol == PROTOCOL_MAILTO) {
                if (url->userlen > 0) {
                        if (!g_hash_table_lookup (task->emails, url)) {
+                               url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
                                g_hash_table_insert (task->emails, url,
                                                url);
                        }
@@ -2371,6 +2372,7 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
        }
        else {
                if (!g_hash_table_lookup (task->urls, url)) {
+                       url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
                        g_hash_table_insert (task->urls, url, url);
                }
        }
@@ -2401,6 +2403,7 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
 
                                if (!g_hash_table_lookup (task->urls,
                                                query_url)) {
+                                       query_url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
                                        g_hash_table_insert (task->urls,
                                                        query_url,
                                                        query_url);
index f56649558b45b33c7acb3dddb3d0de1fe9bb42a3..3fab46c5ea5145d6000d7619cf2db41b74708d02 100644 (file)
@@ -14,6 +14,8 @@ enum rspamd_url_flags {
        RSPAMD_URL_FLAG_NUMERIC = 1 << 1,
        RSPAMD_URL_FLAG_OBSCURED = 1 << 2,
        RSPAMD_URL_FLAG_REDIRECTED = 1 << 3,
+       RSPAMD_URL_FLAG_HTML_DISPLAYED = 1 << 4,
+       RSPAMD_URL_FLAG_FROM_TEXT = 1 << 5,
 };
 
 struct rspamd_url_tag {